def fit(self, y):
    """
    Fit label binarizer.

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1,
        representing multilabel classification.

    Returns
    -------
    self : returns an instance of self.
    """
    self._set_output_type(y)

    if y.ndim > 2:
        raise ValueError("labels cannot be greater than 2 dimensions")

    if y.ndim == 2:
        unique_classes = cp.unique(y)
        # Comparing a CuPy array to a Python list with != yields an
        # element-wise result; compare against an array instead.
        if not cp.array_equal(unique_classes, cp.asarray([0, 1])):
            raise ValueError("2-d array must be binary")
        self._classes_ = CumlArray(cp.arange(0, y.shape[1]))
    else:
        self._classes_ = CumlArray(cp.unique(y).astype(y.dtype))

    cp.cuda.Stream.null.synchronize()
    return self
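# Hedged sketch of the class-derivation logic in fit above, using plain
# CuPy (no LabelBinarizer instance needed): 1-d input keeps its unique
# labels, 2-d binary input gets one class per column.
import cupy as cp

y1 = cp.asarray([1, 4, 2, 4])
print(cp.unique(y1))              # classes_ -> [1 2 4]
y2 = cp.asarray([[0, 1, 1], [1, 0, 0]])
print(cp.arange(0, y2.shape[1]))  # classes_ -> [0 1 2]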
def test_stratified_binary_classification():
    X = cp.array([[0.37487513, -2.3031888, 1.662633, 0.7671007],
                  [-0.49796826, -1.0621182, -0.32518214, -0.20583323],
                  [-1.0104885, -2.4997945, 2.8952584, 1.4712684],
                  [2.008748, -2.4520662, 0.5557737, 0.07749569],
                  [0.97350526, -0.3403474, -0.58081895, -0.23199573]])

    # Needs to fail when we have just one occurrence of a label
    y = cp.array([0, 0, 0, 0, 1])

    with pytest.raises(ValueError):
        train_test_split(X, y, train_size=0.75, stratify=y, shuffle=True)

    y = cp.array([0, 0, 0, 1, 1])
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.75,
                                                        stratify=y,
                                                        random_state=15)

    _, y_counts = cp.unique(y, return_counts=True)
    _, train_counts = cp.unique(y_train, return_counts=True)
    _, test_counts = cp.unique(y_test, return_counts=True)

    # Ensure the per-label counts are preserved across the split
    cp.testing.assert_array_equal(train_counts + test_counts, y_counts)
def sorted_unique_labels(*ys):
    """Extract an ordered array of unique labels from one or more dask
    arrays of labels."""
    ys = (cp.unique(y.map_blocks(lambda x: cp.unique(x)).compute())
          for y in ys)
    labels = cp.unique(cp.concatenate(ys))
    return labels
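# Hedged usage sketch for sorted_unique_labels above, assuming a dask +
# cupy environment where da.from_array wraps GPU-backed blocks.
import cupy as cp
import dask.array as da

y1 = da.from_array(cp.asarray([2, 0, 2, 1]), chunks=2)
y2 = da.from_array(cp.asarray([3, 1]), chunks=2)
print(sorted_unique_labels(y1, y2))  # [0 1 2 3]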
def _ray_fit_preprocess(self, y) -> Callable:
    """This has been separated out so that it can be easily overwritten
    should a future xgboost version remove label encoding"""
    # pylint: disable = attribute-defined-outside-init,too-many-statements
    can_use_label_encoder = True
    use_label_encoder = getattr(self, "use_label_encoder", True)
    label_encoding_check_error = (
        "The label must consist of integer "
        "labels of form 0, 1, 2, ..., [num_class - 1].")
    label_encoder_deprecation_msg = (
        "The use of label encoder in XGBClassifier is deprecated and will "
        "be removed in a future release. To remove this warning, do the "
        "following: 1) Pass option use_label_encoder=False when "
        "constructing XGBClassifier object; and 2) Encode your labels (y) "
        "as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].")

    # ray: modified this to allow for compatibility with legacy xgboost
    if (_is_cudf_df and _is_cudf_df(y)) or (
            _is_cudf_ser and _is_cudf_ser(y)):
        import cupy as cp  # pylint: disable=E0401
        self.classes_ = cp.unique(y.values)
        self.n_classes_ = len(self.classes_)
        can_use_label_encoder = False
        expected_classes = cp.arange(self.n_classes_)
        if (self.classes_.shape != expected_classes.shape
                or not (self.classes_ == expected_classes).all()):
            raise ValueError(label_encoding_check_error)
    elif _is_cupy_array and _is_cupy_array(y):
        import cupy as cp  # pylint: disable=E0401
        self.classes_ = cp.unique(y)
        self.n_classes_ = len(self.classes_)
        can_use_label_encoder = False
        expected_classes = cp.arange(self.n_classes_)
        if (self.classes_.shape != expected_classes.shape
                or not (self.classes_ == expected_classes).all()):
            raise ValueError(label_encoding_check_error)
    else:
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        if not use_label_encoder and (not np.array_equal(
                self.classes_, np.arange(self.n_classes_))):
            raise ValueError(label_encoding_check_error)

    if use_label_encoder:
        if not can_use_label_encoder:
            raise ValueError(
                "The option use_label_encoder=True is incompatible with "
                "inputs of type cuDF or cuPy. Please set "
                "use_label_encoder=False when constructing XGBClassifier "
                "object. NOTE:" + label_encoder_deprecation_msg)
        if hasattr(self, "use_label_encoder"):
            warnings.warn(label_encoder_deprecation_msg, UserWarning)
        self._le = XGBoostLabelEncoder().fit(y)
        label_transform = self._le.transform
    else:
        label_transform = lambda x: x  # noqa: E731

    return label_transform
def setdiff1d(ar1, ar2, assume_unique=False):
    """Find the set difference of two arrays.

    Returns the unique values in `ar1` that are not in `ar2`.

    Parameters
    ----------
    ar1 : cupy.ndarray
        Input array.
    ar2 : cupy.ndarray
        Input array for comparison.
    assume_unique : bool
        By default, False, i.e. input arrays are not unique.
        If True, input arrays are assumed to be unique. This can
        speed up the calculation.

    Returns
    -------
    setdiff1d : cupy.ndarray
        A 1D array of values in `ar1` that are not in `ar2`. The output
        is sorted (even for unsorted input) only when
        `assume_unique=False`.

    See Also
    --------
    numpy.setdiff1d

    """
    if assume_unique:
        ar1 = cupy.ravel(ar1)
    else:
        ar1 = cupy.unique(ar1)
        ar2 = cupy.unique(ar2)
    return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
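# Hedged usage sketch for setdiff1d above (assumes `cupy` is available and
# the module-level `in1d` helper it calls is in scope).
import cupy

a = cupy.asarray([1, 2, 3, 2, 4, 1])
b = cupy.asarray([3, 4, 5, 6])
print(setdiff1d(a, b))  # [1 2] -- sorted unique values of a not in b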
def setxor1d(ar1, ar2, assume_unique=False):
    """Find the set exclusive-or of two arrays.

    Parameters
    ----------
    ar1, ar2 : cupy.ndarray
        Input arrays. They are flattened if they are not already 1-D.
    assume_unique : bool
        By default, False, i.e. input arrays are not unique.
        If True, input arrays are assumed to be unique. This can
        speed up the calculation.

    Returns
    -------
    setxor1d : cupy.ndarray
        Return the sorted, unique values that are in only one (not
        both) of the input arrays.

    See Also
    --------
    numpy.setxor1d

    """
    if not assume_unique:
        ar1 = cupy.unique(ar1)
        ar2 = cupy.unique(ar2)

    aux = cupy.concatenate((ar1, ar2), axis=None)
    if aux.size == 0:
        return aux

    aux.sort()

    return aux[_setxorkernel(aux, aux.size,
                             cupy.zeros(aux.size, dtype=cupy.bool_))]
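# Hedged usage sketch for setxor1d above (assumes `cupy` and the
# module-level `_setxorkernel` helper are in scope).
import cupy

a = cupy.asarray([1, 2, 3, 2, 4])
b = cupy.asarray([2, 3, 5, 7, 5])
print(setxor1d(a, b))  # [1 4 5 7] -- values in exactly one of a, b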
def many_to_one_GPU(ar1, ar2):
    '''Based on np.intersect1d. Special fn for LJ cc gen'''
    import cupy as cp

    ar1 = cp.asarray(ar1)                                # ar1 to GPU
    ar1, ind1, inv1 = cp.unique(ar1, return_index=True,
                                return_inverse=True)     # ar1, ind1, inv1 on GPU
    ar1 = cp.asnumpy(ar1)                                # ar1 to Host
    ind1 = cp.asnumpy(ind1)                              # ind1 to Host
    inv1 = cp.asnumpy(inv1)                              # inv1 to Host

    ar2 = cp.asarray(ar2)                                # ar2 to GPU
    ar2, ind2 = cp.unique(ar2, return_index=True)        # ar2, ind2 on GPU
    ar2 = cp.asnumpy(ar2)                                # ar2 to Host
    ind2 = cp.asnumpy(ind2)                              # ind2 to Host

    aux = np.concatenate((ar1, ar2))                     # aux on Host
    aux_sort_indices = np.argsort(aux, kind='mergesort')  # on Host
    aux = aux[aux_sort_indices]

    mask = aux[1:] == aux[:-1]                           # mask on Host
    ar2_indices = ind2[aux_sort_indices[1:][mask] - ar1.size]  # on Host

    return ar2_indices[inv1]                             # return on Host
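# Hedged usage sketch for many_to_one_GPU above: for every element of ar1
# (which may repeat), it returns the index of the matching value in ar2,
# assuming every value of ar1 occurs in ar2 and ar2's values are unique.
import numpy as np

ar1 = np.array([5, 3, 5, 7])
ar2 = np.array([3, 5, 7, 9])
print(many_to_one_GPU(ar1, ar2))  # [1 0 1 2]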
def intersect1d_GPU(ar1, ar2, assume_unique=False, return_indices=False):
    '''Based on np.intersect1d. Special fn for LJ cc gen'''
    import cupy as cp

    assert (not assume_unique) and return_indices

    ar1 = cp.asarray(ar1)                                # ar1 to GPU
    ar1, ind1 = cp.unique(ar1, return_index=True)        # ar1, ind1 on GPU
    ar1 = cp.asnumpy(ar1)                                # ar1 to Host
    ind1 = cp.asnumpy(ind1)                              # ind1 to Host

    ar2 = cp.asarray(ar2)                                # ar2 to GPU
    ar2, ind2 = cp.unique(ar2, return_index=True)        # ar2, ind2 on GPU
    ar2 = cp.asnumpy(ar2)                                # ar2 to Host
    ind2 = cp.asnumpy(ind2)                              # ind2 to Host

    aux = np.concatenate((ar1, ar2))                     # aux on Host
    aux_sort_indices = np.argsort(aux, kind='mergesort')  # on Host
    aux = aux[aux_sort_indices]

    mask = aux[1:] == aux[:-1]                           # mask on Host
    int1d = aux[:-1][mask]                               # int1d on Host

    ar1_indices = aux_sort_indices[:-1][mask]            # ar1_indices on Host
    ar2_indices = aux_sort_indices[1:][mask] - ar1.size  # ar2_indices on Host
    ar1_indices = ind1[ar1_indices]
    ar2_indices = ind2[ar2_indices]

    return int1d, ar1_indices, ar2_indices               # return on Host
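# Hedged usage sketch for intersect1d_GPU above; note the assert requires
# assume_unique=False (the default) and return_indices=True.
import numpy as np

a = np.array([1, 3, 5, 3])
b = np.array([3, 4, 5])
vals, ia, ib = intersect1d_GPU(a, b, return_indices=True)
print(vals)          # [3 5]
print(a[ia], b[ib])  # both recover [3 5]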
def test_make_blobs_scalar_parameters(dtype, n_samples, n_features, centers,
                                      cluster_std, center_box, shuffle,
                                      random_state, order):
    out, labels = cuml.make_blobs(dtype=dtype,
                                  n_samples=n_samples,
                                  n_features=n_features,
                                  centers=centers,
                                  cluster_std=0.001,
                                  center_box=center_box,
                                  shuffle=shuffle,
                                  random_state=random_state,
                                  order=order)

    assert out.shape == (n_samples, n_features), "out shape mismatch"
    assert labels.shape == (n_samples, ), "labels shape mismatch"

    if order == 'F':
        assert out.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert out.flags['C_CONTIGUOUS']

    if centers is None:
        assert cp.unique(labels).shape == (3,), \
            "unexpected number of clusters"
    elif centers <= n_samples:
        assert cp.unique(labels).shape == (centers,), \
            "unexpected number of clusters"
def confusion_matrix(client, y_true, y_pred,
                     normalize=None, sample_weight=None):
    from cuml.dask.common.input_utils import DistributedDataHandler

    unique_classes = cp.unique(
        y_true.map_blocks(lambda x: cp.unique(x)).compute())
    nclasses = len(unique_classes)

    ddh = DistributedDataHandler.create([y_true, y_pred])

    cms = client.compute(
        [
            client.submit(local_cm, part, unique_classes, sample_weight,
                          workers=[worker])
            for worker, part in ddh.gpu_futures
        ],
        sync=True,
    )

    cm = cp.zeros((nclasses, nclasses))
    for i in cms:
        cm += i

    with np.errstate(all="ignore"):
        if normalize == "true":
            cm = cm / cm.sum(axis=1, keepdims=True)
        elif normalize == "pred":
            cm = cm / cm.sum(axis=0, keepdims=True)
        elif normalize == "all":
            cm = cm / cm.sum()
        cm = cp.nan_to_num(cm)

    return cm
def intersect1d(arr1, arr2, assume_unique=False, return_indices=False):
    """Find the intersection of two arrays.

    Returns the sorted, unique values that are in both of the input
    arrays.

    Parameters
    ----------
    arr1, arr2 : cupy.ndarray
        Input arrays. Arrays will be flattened if they are not 1D.
    assume_unique : bool
        By default, False. If True, the input arrays are assumed to be
        unique, which speeds up the calculation. If True but the arrays
        are not actually unique, incorrect results and out-of-bounds
        indices may result.
    return_indices : bool
        By default, False. If True, the indices which correspond to the
        intersection of the two arrays are also returned.

    Returns
    -------
    intersect1d : cupy.ndarray
        Sorted 1D array of common and unique elements.
    comm1 : cupy.ndarray
        The indices of the first occurrences of the common values
        in `arr1`. Only provided if `return_indices` is True.
    comm2 : cupy.ndarray
        The indices of the first occurrences of the common values
        in `arr2`. Only provided if `return_indices` is True.

    See Also
    --------
    numpy.intersect1d

    """
    if not assume_unique:
        if return_indices:
            arr1, ind1 = cupy.unique(arr1, return_index=True)
            arr2, ind2 = cupy.unique(arr2, return_index=True)
        else:
            arr1 = cupy.unique(arr1)
            arr2 = cupy.unique(arr2)
    else:
        arr1 = arr1.ravel()
        arr2 = arr2.ravel()

    if not return_indices:
        mask = _search._exists_kernel(arr1, arr2, arr2.size, False)
        return arr1[mask]

    mask, v1 = _search._exists_and_searchsorted_kernel(
        arr1, arr2, arr2.size, False)
    int1d = arr1[mask]
    arr1_indices = cupy.flatnonzero(mask)
    arr2_indices = v1[mask]

    if not assume_unique:
        arr1_indices = ind1[arr1_indices]
        arr2_indices = ind2[arr2_indices]

    return int1d, arr1_indices, arr2_indices
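# Hedged usage sketch for intersect1d above (mirrors the numpy.intersect1d
# behaviour it documents; assumes the module's _search kernels are in scope).
import cupy

a = cupy.asarray([1, 3, 4, 3])
b = cupy.asarray([3, 1, 2, 1])
vals, ia, ib = intersect1d(a, b, return_indices=True)
print(vals)          # [1 3]
print(a[ia], b[ib])  # both recover [1 3]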
def make_classification_dataset(datatype, nrows, ncols, nclasses):
    n_real_features = min(ncols,
                          int(max(nclasses * 2, math.ceil(ncols / 10))))
    n_clusters_per_class = min(2, max(1, int(2**n_real_features / nclasses)))
    n_redundant = min(ncols - n_real_features,
                      max(2, math.ceil(ncols / 20)))

    try:
        X, y = data.make_classification(
            dtype=datatype,
            n_samples=nrows + 1000,
            n_features=ncols,
            random_state=SEED,
            class_sep=1.0,
            n_informative=n_real_features,
            n_clusters_per_class=n_clusters_per_class,
            n_redundant=n_redundant,
            n_classes=nclasses)

        r = dsel.train_test_split(X, y, random_state=SEED, train_size=nrows)

        if len(cp.unique(r[2])) < nclasses:
            raise ValueError("Training data does not have all classes.")

        return r
    except ValueError:
        pytest.skip(
            "Skipping the test for invalid combination of ncols/nclasses")
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    order, client):
    c = client
    nrows = int(nrows)

    X, y = make_blobs(nrows, ncols,
                      centers=centers,
                      cluster_std=cluster_std,
                      dtype=dtype,
                      n_parts=nparts,
                      order=order,
                      client=client)

    assert len(X.chunks[0]) == nparts
    assert len(y.chunks[0]) == nparts

    assert X.shape == (nrows, ncols)
    assert y.shape == (nrows, )

    y_local = y.compute()
    assert len(cp.unique(y_local)) == centers

    X_ddh = DistributedDataHandler.create(data=X, client=c)
    X_first = X_ddh.gpu_futures[0][1].result()

    if order == 'F':
        assert X_first.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert X_first.flags['C_CONTIGUOUS']
def _binary_roc_auc_score(y_true, y_score):
    """Compute binary roc_auc_score using cupy"""

    if cp.unique(y_true).shape[0] == 1:
        raise ValueError("roc_auc_score cannot be used when "
                         "only one class present in y_true. ROC AUC score "
                         "is not defined in that case.")

    if cp.unique(y_score).shape[0] == 1:
        return 0.5

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    return _calculate_area_under_curve(fpr, tpr).item()
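# Hedged usage sketch for _binary_roc_auc_score above, assuming the private
# helpers it calls (_binary_clf_curve, _calculate_area_under_curve) are in
# scope as in the surrounding metrics module.
import cupy as cp

y_true = cp.asarray([0, 0, 1, 1])
y_score = cp.asarray([0.1, 0.4, 0.35, 0.8])
print(_binary_roc_auc_score(y_true, y_score))  # 0.75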
def test_make_classification(n_samples, n_features, hypercube, n_classes,
                             n_clusters_per_class, n_informative,
                             random_state, n_parts, order, dtype, client):
    from cuml.dask.datasets.classification import make_classification

    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, hypercube=hypercube,
                               n_clusters_per_class=n_clusters_per_class,
                               n_informative=n_informative,
                               random_state=random_state, n_parts=n_parts,
                               order=order, dtype=dtype)

    assert len(X.chunks[0]) == n_parts
    assert len(X.chunks[1]) == 1
    assert len(y.chunks[0]) == n_parts

    assert X.shape == (n_samples, n_features)
    assert y.shape == (n_samples, )

    assert X.dtype == dtype
    assert y.dtype == np.int64

    import cupy as cp
    y_local = y.compute()
    assert len(cp.unique(y_local)) == n_classes

    X_parts = client.sync(_extract_partitions, X)
    X_first = X_parts[0][1].result()

    if order == 'F':
        assert X_first.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert X_first.flags['C_CONTIGUOUS']
def test_map_array_incorrect_output_shape():
    labels = cp.random.randint(0, 5, size=(24, 25))
    out = cp.empty((24, 24))
    in_values = cp.unique(labels)
    out_values = cp.random.random(in_values.shape).astype(out.dtype)
    with pytest.raises(ValueError):
        map_array(labels, in_values, out_values, out=out)
def _build_laplacian(data, spacing, mask, beta, multichannel):
    l_x, l_y, l_z = data.shape[:3]
    edges = _make_graph_edges_3d(l_x, l_y, l_z)
    weights = _compute_weights_3d(data, spacing, beta=beta, eps=1.e-10,
                                  multichannel=multichannel)
    assert weights.dtype == data.dtype
    if mask is not None:
        # Remove edges of the graph connected to masked nodes, as well
        # as corresponding weights of the edges.
        mask0 = cp.concatenate([mask[..., :-1].ravel(),
                                mask[:, :-1].ravel(),
                                mask[:-1].ravel()])
        mask1 = cp.concatenate([mask[..., 1:].ravel(),
                                mask[:, 1:].ravel(),
                                mask[1:].ravel()])
        ind_mask = cp.logical_and(mask0, mask1)
        edges, weights = edges[:, ind_mask], weights[ind_mask]

        # Reassign edge labels to 0, 1, ..., edges_number - 1
        _, inv_idx = cp.unique(edges, return_inverse=True)
        edges = inv_idx.reshape(edges.shape)

    # Build the sparse linear system
    pixel_nb = l_x * l_y * l_z
    i_indices = edges.ravel()
    j_indices = edges[::-1].ravel()
    data = cp.concatenate((weights, weights))
    lap = sparse.coo_matrix((data, (i_indices, j_indices)),
                            shape=(pixel_nb, pixel_nb))
    # need CSR instead of COO for indexing used later in _build_linear_system
    lap = lap.tocsr()
    lap.setdiag(-cp.ravel(lap.sum(axis=0)))
    return lap
def clusterAverage(clu, spikeQuantity):
    # get the average of some quantity across spikes in each cluster, given
    # the quantity for each spike
    #
    # e.g.
    # > clusterDepths = clusterAverage(clu, spikeDepths)
    #
    # clu and spikeQuantity must be vectors of the same size
    #
    # using a super-tricky algorithm for this - when you make a sparse
    # array, the values of any duplicate indices are added. So this is the
    # fastest way I know to make the sum of the entries of spikeQuantity
    # for each of the unique entries of clu
    _, cluInds, spikeCounts = cp.unique(clu, return_inverse=True,
                                        return_counts=True)

    # summation
    q = cpx.scipy.sparse.coo_matrix(
        (spikeQuantity, (cluInds, cp.zeros(len(clu))))).toarray().flatten()

    # we have sums, so dividing by the spike counts gives the mean of the
    # quantity for each cluster
    clusterQuantity = q / spikeCounts
    return clusterQuantity
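# Hedged usage sketch for clusterAverage above, assuming `cp` is cupy and
# `cpx` is cupyx as in the module imports.
import cupy as cp

clu = cp.asarray([0, 0, 1, 1, 1])                 # cluster id per spike
depths = cp.asarray([10.0, 20.0, 5.0, 7.0, 9.0])  # quantity per spike
print(clusterAverage(clu, depths))                # [15.  7.]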
def get_states_numbers(self):
    """For all possible states, return the number of agents on the map in
    each state.

    Returns two cupy arrays: the first holds the state ids and the second
    the number of agents currently in each of those states on the map.
    """
    state_ids, n_agents = cp.unique(self.current_state_ids,
                                    return_counts=True)
    return state_ids, n_agents
def _label2rgb_avg(label_field, image, bg_label=0, bg_color=(0, 0, 0)):
    """Visualise each segment in `label_field` with its mean color in
    `image`.

    Parameters
    ----------
    label_field : array of int
        A segmentation of an image.
    image : array, shape ``label_field.shape + (3,)``
        A color image of the same spatial shape as `label_field`.
    bg_label : int, optional
        A value in `label_field` to be treated as background.
    bg_color : 3-tuple of int, optional
        The color for the background label.

    Returns
    -------
    out : array, same shape and type as `image`
        The output visualization.
    """
    out = cp.zeros(label_field.shape + (3, ))
    labels = cp.unique(label_field)
    bg = labels == bg_label
    if bg.any():
        labels = labels[labels != bg_label]
        mask = (label_field == bg_label).nonzero()
        out[mask] = bg_color
    for label in labels:
        mask = (label_field == label).nonzero()
        color = image[mask].mean(axis=0)
        out[mask] = color
    return out
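# Hedged usage sketch for _label2rgb_avg above, assuming `cp` is cupy.
import cupy as cp

labels = cp.asarray([[0, 1], [1, 1]])
image = cp.asarray([[[0.2, 0.2, 0.2], [1.0, 0.0, 0.0]],
                    [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]])
out = _label2rgb_avg(labels, image)
# label 0 is painted bg_color (0, 0, 0); label 1 gets the mean of its
# three pixel colors, i.e. (1/3, 1/3, 1/3)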
def test_map_array_non_contiguous_output_array():
    labels = cp.random.randint(0, 5, size=(24, 25))
    out = cp.empty((24 * 3, 25 * 2))[::3, ::2]
    in_values = cp.unique(labels)
    out_values = cp.random.random(in_values.shape).astype(out.dtype)
    with pytest.raises(ValueError):
        map_array(labels, in_values, out_values, out=out)
def _match_label_with_color(label, colors, bg_label, bg_color):
    """Return `unique_labels` and `color_cycle` for label array and color
    list.

    Colors are cycled for normal labels, but the background color should
    only be used for the background.
    """
    # Temporarily set background color; it will be removed later.
    if bg_color is None:
        bg_color = (0, 0, 0)
    bg_color = _rgb_vector(bg_color)

    # map labels to their ranks among all labels from small to large
    unique_labels, mapped_labels = cp.unique(label, return_inverse=True)

    # get rank of bg_label
    # for CuPy use .ravel() instead of .flat
    bg_label_rank_list = mapped_labels[label.ravel() == bg_label]

    # The rank of each label is the index of the color it is matched to in
    # color cycle. bg_label should always be mapped to the first color, so
    # its rank must be 0. Other labels should be ranked from small to large
    # starting from 1.
    if len(bg_label_rank_list) > 0:
        bg_label_rank = bg_label_rank_list[0]
        mapped_labels[mapped_labels < bg_label_rank] += 1
        mapped_labels[label.ravel() == bg_label] = 0
    else:
        mapped_labels += 1

    # Modify labels and color cycle so background color is used only once.
    color_cycle = itertools.cycle(colors)
    color_cycle = itertools.chain([bg_color], color_cycle)

    return mapped_labels, color_cycle
def remove_indices(self):
    """Make the feature vector `self.features`."""
    # 0 <= v, d1, d2, d3 <= 28 and 0 <= e <= 28*28 = 784, so v, d1, d2, d3
    # each need 10**2 slots and e needs 10**3 slots.
    if not self.use_d:
        features = cp.vstack((self.num_vertices, self.num_edges))
        features = features[0] + features[1] * (10**2)  # ve
    else:
        features = cp.vstack((self.num_vertices, self.num_edges,
                              self.num_id1, self.num_id2, self.num_id3))
        features = (features[0]
                    + features[1] * (10**2)
                    + features[2] * (10**(2 + 3))
                    + features[3] * (10**(2 + 3 + 2))
                    + features[4] * (10**(2 + 3 + 2 + 2)))  # veid1id2id3

    self.features = cp.unique(features, return_counts=True)
    self.prob_of_measuring_0ket = cp.sum(
        self.features[1]**2) / (2**(2 * self.adjacency_mat.shape[0]))
    divide_value = np.sqrt(cp.sum(self.features[1]**2))
    self.normalized_features = (self.features[0],
                                self.features[1] / divide_value)

    del self.indices
    del self.num_vertices
    del self.num_edges
    if self.use_d:
        del self.num_id1
        del self.num_id2
        del self.num_id3

    self.features = (cp.asnumpy(self.features[0]),
                     cp.asnumpy(self.features[1]))
    self.normalized_features = (cp.asnumpy(self.normalized_features[0]),
                                cp.asnumpy(self.normalized_features[1]))
def _csr_column_index1(col_idxs, Ap, Aj):
    """Construct indptr and components for populating indices and data of
    output sparse array.

    Args
        col_idxs : column indices to index from input indices
        Ap : indptr of input sparse matrix
        Aj : indices of input sparse matrix

    Returns
        Bp : indptr of output sparse matrix
        Aj_mask : input indices array with all cols not matching the
                  index masked out with -1
        col_counts : number of times each unique index occurs in Aj
        sort_idxs : indices sorted to preserve original order of idxs
    """
    idx_map, sort_idxs = cupy.unique(col_idxs, return_index=True)
    sort_idxs = sort_idxs.astype(idx_map.dtype)
    idxs = cupy.searchsorted(idx_map, col_idxs)

    col_counts = cupy.zeros(idx_map.size, dtype=col_idxs.dtype)
    cupyx.scatter_add(col_counts, idxs, 1)

    Bp, Aj_mask = _csr_column_index1_indptr(idx_map, sort_idxs,
                                            col_counts, Ap, Aj)

    return Bp, Aj_mask, col_counts, sort_idxs
def _binary_clf_curve(y_true, y_score):
    if y_true.dtype.kind == 'f' and np.any(y_true != y_true.astype(int)):
        raise ValueError("Continuous format of y_true "
                         "is not supported.")

    ids = cp.argsort(-y_score)
    sorted_score = y_score[ids]

    ones = y_true[ids].astype('float32')  # for calculating true positives
    zeros = 1 - ones                      # for calculating false positives

    # calculate groups of samples sharing the same score
    group = _group_same_scores(sorted_score)
    num = int(group[-1])

    tps = cp.zeros(num, dtype='float32')
    fps = cp.zeros(num, dtype='float32')

    tps = _addup_x_in_group(group, ones, tps)
    fps = _addup_x_in_group(group, zeros, fps)

    tps = cp.cumsum(tps)
    fps = cp.cumsum(fps)
    thresholds = cp.unique(y_score)
    return fps, tps, thresholds
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    cluster, output):
    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows, ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X = X.compute()
        y = y.compute()

        assert X.shape == (nrows, ncols)
        assert y.shape == (nrows, 1)

        if output == 'dataframe':
            assert len(y[0].unique()) == centers
            assert X.dtypes.unique() == [dtype]
        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y)) == centers
            assert y.dtype == dtype
    finally:
        c.close()
def shannon_entropy(image, base=2):
    """Calculate the Shannon entropy of an image.

    The Shannon entropy is defined as S = -sum(pk * log(pk)), where pk are
    frequency/probability of pixels of value k.

    Parameters
    ----------
    image : (N, M) ndarray
        Grayscale input image.
    base : float, optional
        The logarithmic base to use.

    Returns
    -------
    entropy : 0-dimensional float cupy.ndarray

    Notes
    -----
    The returned value is measured in bits or shannon (Sh) for base=2,
    natural unit (nat) for base=np.e and hartley (Hart) for base=10.

    References
    ----------
    .. [1] `https://en.wikipedia.org/wiki/Entropy_(information_theory)
           <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`_
    .. [2] https://en.wiktionary.org/wiki/Shannon_entropy
    """  # noqa
    _, counts = cp.unique(image, return_counts=True)
    return scipy_entropy(counts, base=base)
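# Hedged usage sketch for shannon_entropy above, assuming `cp` is cupy and
# `scipy_entropy` is the entropy helper imported by this module.
import cupy as cp

img = cp.asarray([[0, 0], [1, 1]])  # two equiprobable pixel values
print(shannon_entropy(img))         # 1.0 (one bit, since base=2)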
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    cluster, order, output):
    c = Client(cluster)
    try:
        X, y = make_blobs(nrows, ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output,
                          order=order)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X_local = X.compute()
        y_local = y.compute()

        assert X_local.shape == (nrows, ncols)

        if output == 'dataframe':
            assert len(y_local[0].unique()) == centers
            assert X_local.dtypes.unique() == [dtype]
            assert y_local.shape == (nrows, 1)
        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y_local)) == centers
            assert y_local.dtype == dtype
            assert y_local.shape == (nrows, )
    finally:
        c.close()
def _match_cumulative_cdf(source, template):
    """
    Return modified source array so that the cumulative density function of
    its values matches the cumulative density function of the template.
    """
    src_values, src_unique_indices, src_counts = cp.unique(
        source.ravel(), return_inverse=True, return_counts=True)
    tmpl_values, tmpl_counts = cp.unique(template.ravel(),
                                         return_counts=True)

    # calculate normalized quantiles for each array
    src_quantiles = cp.cumsum(src_counts) / source.size
    tmpl_quantiles = cp.cumsum(tmpl_counts) / template.size

    interp_a_values = cp.interp(src_quantiles, tmpl_quantiles, tmpl_values)
    return interp_a_values[src_unique_indices].reshape(source.shape)
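# Hedged usage sketch for _match_cumulative_cdf above, assuming `cp` is
# cupy. With uniform counts, each source value maps to the template value
# at the same quantile.
import cupy as cp

source = cp.asarray([0.0, 1.0, 2.0, 3.0])
template = cp.asarray([10.0, 20.0, 30.0, 40.0])
print(_match_cumulative_cdf(source, template))  # [10. 20. 30. 40.]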
def _update_infection_probs(self, random_seed=None):
    """Update the probability of infection based on how many inhabitants
    are infectious in each city."""
    if random_seed is None:
        self._reset_random_seed()
    else:
        cp.random.seed(random_seed)

    infected_indices = self._indices[self._is_infectious]
    quarantine = self._is_in_quarantine[infected_indices]
    quarantine = quarantine * (cp.random.random(len(infected_indices))
                               <= self._quarantine_effifiency)
    infected_indices = infected_indices[~quarantine]
    infectious_city_ids = self.city_id[infected_indices]

    if len(infectious_city_ids) == 0:
        self._city_infected_counts = cp.zeros(len(self.city_ids))
    else:
        city_ids, infected_counts = cp.unique(infectious_city_ids,
                                              return_counts=True)
        _, self._city_infected_counts = self._sort_by_city_ids(
            city_ids, infected_counts, as_json=False)

    self.city_infection_probs = (
        self._city_infected_counts / self._city_population_sizes
        * self._virus.transmission_probability)