def _conv_array_to_sparse(arr):
    """
    Converts an array (or cudf.DataFrame) to a sparse array
    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse CSR matrix
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix
    if scipy_sparse_isspmatrix(arr):
        ret = cupyx.scipy.sparse.csr_matrix(arr.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(arr):
        ret = arr
    elif isinstance(arr, cudf.DataFrame):
        ret = _conv_df_to_sparse(arr)
    elif isinstance(arr, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray,
                                arr,
                                dtype=arr.dtype)
        ret = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(arr, cp.core.core.ndarray):
        ret = cupyx.scipy.sparse.csr_matrix(arr)
    else:
        raise ValueError("Unexpected input type %s" % type(arr))
    return ret
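# Hypothetical usage sketch (not part of the cuML module above): shows how the
# helper might be exercised with both SciPy-sparse and dense NumPy inputs,
# assuming a GPU runtime with cupy/cupyx installed and _conv_array_to_sparse
# importable from its defining module.
def _example_conv_array_to_sparse():
    import numpy as np
    import scipy.sparse
    import cupyx

    dense = np.random.rand(8, 4).astype(np.float32)
    # Either input form should come back as a cupyx CSR matrix.
    out_from_scipy = _conv_array_to_sparse(scipy.sparse.csr_matrix(dense))
    out_from_dense = _conv_array_to_sparse(dense)
    assert cupyx.scipy.sparse.isspmatrix_csr(out_from_scipy)
    assert cupyx.scipy.sparse.isspmatrix_csr(out_from_dense)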
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def inverse_transform(self, y, threshold=None):
    """
    Transform binary labels back to original multi-class labels

    Parameters
    ----------

    y : array of shape [n_samples, n_classes]
    threshold : float
        this value is currently ignored

    Returns
    -------

    arr : array with original labels
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # If we are already given multi-class, just return it.
    if cupyx.scipy.sparse.isspmatrix(y):
        y_mapped = y.tocsr().indices.astype(self._classes_.dtype)
    elif scipy_sparse_isspmatrix(y):
        y = y.tocsr()
        y_mapped = rmm_cupy_ary(cp.array, y.indices,
                                dtype=y.indices.dtype)
    else:
        y_mapped = rmm_cupy_ary(cp.argmax,
                                rmm_cupy_ary(cp.asarray, y, dtype=y.dtype),
                                axis=1).astype(y.dtype)

    return invert_labels(y_mapped, self._classes_)
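# Hypothetical sketch (not from the cuML sources) of what the dense branch of
# inverse_transform above computes: for one-hot rows, argmax recovers the
# column index, which invert_labels then maps back to the original classes.
def _example_inverse_transform_dense():
    import cupy as cp

    classes = cp.array([10, 20, 30])
    one_hot = cp.array([[0, 1, 0],
                        [0, 0, 1],
                        [1, 0, 0]], dtype=cp.float32)
    col_idx = cp.argmax(one_hot, axis=1)
    # Mapping column indices through the class array recovers the labels.
    assert (classes[col_idx] == cp.array([20, 30, 10])).all()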
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    X = X.astype(np.float32)

    if datatype == "dataframe":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    knn_cu = cuKNN()
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, np.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if algo == "ivfpq":
        pytest.xfail("Memory access error in IVFPQ, see "
                     "https://github.com/rapidsai/cuml/issues/3318")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, cp.core.core.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_random_projection_fit_transform(datatype, method):
    if has_scipy():
        from scipy.spatial.distance import pdist
    else:
        pytest.skip('Skipping test_random_projection_fit_transform because ' +
                    'Scipy is missing')

    eps = 0.2

    # dataset generation
    data, target = make_blobs(n_samples=800, centers=400, n_features=3000)

    # conversion to input_type
    data = data.astype(datatype)
    target = target.astype(datatype)

    # creation of model
    if method == 'gaussian':
        model = GaussianRandomProjection(eps=eps)
    else:
        model = SparseRandomProjection(eps=eps)

    # fitting the model
    model.fit(data)

    # applying transformation
    transformed_data = model.transform(data)

    original_pdist = pdist(data)
    embedded_pdist = pdist(transformed_data)

    # check JL lemma
    assert (np.all(((1.0 - eps) * original_pdist) <= embedded_pdist) and
            np.all(embedded_pdist <= ((1.0 + eps) * original_pdist)))
def predict(neigh_ind, _y, n_neighbors):
    if has_scipy():
        import scipy.stats as stats
    else:
        raise RuntimeError('Scipy is needed to run predict()')

    neigh_ind = neigh_ind.astype(np.int64)

    ypred, count = stats.mode(_y[neigh_ind], axis=1)
    return ypred.ravel(), count.ravel() * 1.0 / n_neighbors
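# Hypothetical sketch (not from the cuML tests) of the predict() helper above:
# each row of neigh_ind indexes into _y, the per-row mode is the predicted
# label, and the mode count divided by n_neighbors acts as a probability.
def _example_predict():
    import numpy as np

    y = np.array([0, 0, 1, 1])
    # Two query points: neighbors of the first are mostly label 0,
    # neighbors of the second are mostly label 1.
    neigh_ind = np.array([[0, 1, 2],
                          [2, 3, 0]])
    labels, probs = predict(neigh_ind, y, n_neighbors=3)
    assert list(labels) == [0, 1]
    assert np.allclose(probs, [2 / 3, 2 / 3])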
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow
    high-precision mode used to correct for approximation errors in L2
    computation during NN searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p

    if not has_scipy():
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                              return_distance=True,
                                              two_pass_precision=True)

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    assert_allclose(neigh_dist,
                    np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
                    atol=1e-4)
def test_entropy_random(n_samples, base, use_handle):
    if has_scipy():
        from scipy.stats import entropy as sp_entropy
    else:
        pytest.skip('Skipping test_entropy_random because Scipy is missing')

    handle, stream = get_handle(use_handle)

    clustering, _, _, _ = \
        generate_random_labels(lambda rng: rng.randint(0, 1000, n_samples))

    # generate unnormalized probabilities from clustering
    pk = np.bincount(clustering)

    # scipy's entropy uses probabilities
    sp_S = sp_entropy(pk, base=base)
    # we use a clustering
    S = entropy(np.array(clustering, dtype=np.int32), base, handle=handle)

    assert_almost_equal(S, sp_S, decimal=2)
def test_basic_functions(labels, dtype, sparse_output):

    fit_labels, xform_labels = labels

    skl_bin = skLB(sparse_output=sparse_output)
    skl_bin.fit(fit_labels)

    fit_labels = cp.asarray(fit_labels, dtype=dtype)
    xform_labels = cp.asarray(xform_labels, dtype=dtype)

    binarizer = LabelBinarizer(sparse_output=sparse_output)
    binarizer.fit(fit_labels)

    assert array_equal(binarizer.classes_.get(),
                       np.unique(fit_labels.get()))

    xformed = binarizer.transform(xform_labels)

    if sparse_output:
        skl_bin_xformed = skl_bin.transform(xform_labels.get())

        if has_scipy():
            import scipy.sparse
        else:
            pytest.skip('Skipping test_basic_functions(sparse_output=True) ' +
                        'because Scipy is missing')

        skl_csr = scipy.sparse.coo_matrix(skl_bin_xformed).tocsr()
        cuml_csr = xformed

        array_equal(skl_csr.data, cuml_csr.data.get())

        # TODO: Support sparse inputs
        # xformed = xformed.todense().astype(dtype)

    assert xformed.shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)

    assert array_equal(original.get(), xform_labels.get())
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if algo == "ivfpq":
        pytest.xfail("Warning: IVFPQ might be unstable in this "
                     "version of cuML. This is due to a known issue "
                     "in the FAISS release that this cuML version "
                     "is linked to. (see FAISS issue #1421)")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, cp.core.core.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
    sklearn_metrics = set(sklearn.neighbors.VALID_METRICS_SPARSE[algo])
    sklearn_metrics.update(sklearn.neighbors.VALID_METRICS[algo])
    return [value for value in cuml_metrics if value in sklearn_metrics]


def metric_p_combinations():
    for metric in valid_metrics():
        yield metric, 2
        if metric in ("minkowski", "lp"):
            yield metric, 3


@pytest.mark.parametrize("datatype", ["dataframe", "numpy"])
@pytest.mark.parametrize("metric_p", metric_p_combinations())
@pytest.mark.parametrize("nrows", [1000, stress_param(10000)])
@pytest.mark.skipif(not has_scipy(), reason="Skipping test_self_neighboring"
                    " because Scipy is missing")
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow
    high-precision mode used to correct for approximation errors in L2
    computation during NN searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p
def batched_fmin_lbfgs_b(func, x0, num_batches, fprime=None, args=(),
                         bounds=None, m=10, factr=1e7, pgtol=1e-5,
                         epsilon=1e-8, iprint=-1, maxiter=15000, maxls=20):
    """A batch-aware L-BFGS-B implementation to minimize a loss function `f`
    given an initial set of parameters `x0`.

    Parameters
    ----------
    func : function (x: array) -> array[M] (M = n_batches)
        The function to minimize. The function should return an array of
        size = `num_batches`
    x0 : array
        Starting parameters
    fprime : function (x: array) -> array[M*n_params] (optional)
        The gradient. Should return an array of derivatives for each
        parameter over batches. When omitted, uses finite differencing to
        estimate the gradient.
    args : Tuple
        Additional arguments to func and fprime
    bounds : List[Tuple[float, float]]
        Box constraints on the parameters
    m : int
        L-BFGS parameter: number of previous arrays to store when
        estimating the inverse Hessian.
    factr : float
        Stopping criterion when function evaluation is not progressing.
        Stop when `|f(xk+1) - f(xk)| < factr * eps_mach` where `eps_mach`
        is the machine precision
    pgtol : float
        Stopping criterion when gradient is sufficiently "flat".
        Stop when |grad| < pgtol.
    epsilon : float
        Finite-differencing step size when approximating `fprime`
    iprint : int
        -1 for no diagnostic info
        n=1-100 for diagnostic info every n steps.
        >100 for detailed diagnostic info
    maxiter : int
        Maximum number of L-BFGS iterations
    maxls : int
        Maximum number of line-search iterations.
    """
    if has_scipy():
        from scipy.optimize import _lbfgsb
    else:
        raise RuntimeError("Scipy is needed to run batched_fmin_lbfgs_b")

    nvtx_range_push("LBFGS")

    n = len(x0) // num_batches

    if fprime is None:
        def fprime_f(x):
            return _fd_fprime(x, func, epsilon)
        fprime = fprime_f

    if bounds is None:
        bounds = [(None, None)] * n

    nbd = np.zeros(n, np.int32)
    low_bnd = np.zeros(n, np.float64)
    upper_bnd = np.zeros(n, np.float64)
    bounds_map = {(None, None): 0,
                  (1, None): 1,
                  (1, 1): 2,
                  (None, 1): 3}
    for i in range(0, n):
        lb, ub = bounds[i]
        if lb is not None:
            low_bnd[i] = lb
            lb = 1
        if ub is not None:
            upper_bnd[i] = ub
            ub = 1
        nbd[i] = bounds_map[lb, ub]

    # working arrays needed by L-BFGS-B implementation in SciPy.
    # One for each series
    x = [np.copy(np.array(x0[ib*n:(ib+1)*n], np.float64))
         for ib in range(num_batches)]
    f = [np.copy(np.array(0.0, np.float64)) for ib in range(num_batches)]
    g = [np.copy(np.zeros((n,), np.float64)) for ib in range(num_batches)]
    wa = [np.copy(np.zeros(2*m*n + 5*n + 11*m*m + 8*m, np.float64))
          for ib in range(num_batches)]
    iwa = [np.copy(np.zeros(3*n, np.int32)) for ib in range(num_batches)]
    task = [np.copy(np.zeros(1, 'S60')) for ib in range(num_batches)]
    csave = [np.copy(np.zeros(1, 'S60')) for ib in range(num_batches)]
    lsave = [np.copy(np.zeros(4, np.int32)) for ib in range(num_batches)]
    isave = [np.copy(np.zeros(44, np.int32)) for ib in range(num_batches)]
    dsave = [np.copy(np.zeros(29, np.float64)) for ib in range(num_batches)]

    for ib in range(num_batches):
        task[ib][:] = 'START'

    n_iterations = np.zeros(num_batches, dtype=np.int32)

    converged = num_batches * [False]

    warn_flag = np.zeros(num_batches)

    while not all(converged):
        nvtx_range_push("LBFGS-ITERATION")
        for ib in range(num_batches):
            if converged[ib]:
                continue
            _lbfgsb.setulb(m, x[ib], low_bnd, upper_bnd, nbd,
                           f[ib], g[ib], factr, pgtol, wa[ib], iwa[ib],
                           task[ib], iprint, csave[ib], lsave[ib],
                           isave[ib], dsave[ib], maxls)

        xk = np.concatenate(x)
        fk = func(xk)
        gk = fprime(xk)

        for ib in range(num_batches):
            if converged[ib]:
                continue

            task_str = task[ib].tostring()
            task_str_strip = task[ib].tostring().strip(b'\x00').strip()
            if task_str.startswith(b'FG'):
                # needs function evaluation
                f[ib] = fk[ib]
                g[ib] = gk[ib*n:(ib+1)*n]
            elif task_str.startswith(b'NEW_X'):
                n_iterations[ib] += 1
                if n_iterations[ib] >= maxiter:
                    task[ib][:] = 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
            elif task_str_strip.startswith(b'CONV'):
                converged[ib] = True
                warn_flag[ib] = 0
            else:
                converged[ib] = True
                warn_flag[ib] = 2
                continue
        nvtx_range_pop()

    xk = np.concatenate(x)

    if iprint > 0:
        logger.info("CONVERGED in ({}-{}) iterations (|\\/f|={})".format(
            np.min(n_iterations), np.max(n_iterations),
            np.linalg.norm(fprime(xk), np.inf)))

    if (warn_flag > 0).any():
        for ib in range(num_batches):
            if warn_flag[ib] > 0:
                logger.info("WARNING: id={} convergence issue: {}".format(
                    ib, task[ib].tostring()))

    nvtx_range_pop()
    return xk, n_iterations, warn_flag
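# Hypothetical usage sketch (not from the cuML sources): minimize a simple
# per-batch quadratic with batched_fmin_lbfgs_b. Each batch member has two
# parameters and an independent minimum; an analytic gradient is supplied so
# no finite differencing is needed.
def _example_batched_fmin_lbfgs_b():
    num_batches = 3
    n_params = 2
    # Per-batch minima at (ib, -ib)
    targets = np.array([[ib, -ib] for ib in range(num_batches)], np.float64)

    def loss(x):
        # Returns one loss value per batch member, as required by `func`.
        p = x.reshape(num_batches, n_params)
        return ((p - targets) ** 2).sum(axis=1)

    def grad(x):
        # Flat gradient over all batches, as required by `fprime`.
        p = x.reshape(num_batches, n_params)
        return (2.0 * (p - targets)).ravel()

    x0 = np.zeros(num_batches * n_params)
    xk, n_iter, warn = batched_fmin_lbfgs_b(loss, x0, num_batches,
                                            fprime=grad)
    assert np.allclose(xk.reshape(num_batches, n_params), targets, atol=1e-4)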