def test_trustworthiness(input_type, n_samples, n_features, n_components):
    """Compare cuML's trustworthiness score against scikit-learn's.

    Blob data is embedded with UMAP, both arrays are cast to float32, and
    the two implementations must agree within a relative tolerance of 1e-4.
    When ``input_type`` is ``'dataframe'`` the cuML score is computed from
    cudf DataFrames built column by column; otherwise the arrays are passed
    directly.
    """
    def _as_gdf(matrix):
        # One float32 column per feature, keyed by the column index as str.
        frame = cudf.DataFrame()
        for col in range(matrix.shape[1]):
            frame[str(col)] = np.asarray(matrix[:, col], dtype=np.float32)
        return frame

    n_centers = round(n_samples * 0.4)
    X, _ = make_blobs(n_samples=n_samples, centers=n_centers,
                      n_features=n_features)
    X_embedded = UMAP(n_components=n_components).fit_transform(X)

    X = X.astype(np.float32)
    X_embedded = X_embedded.astype(np.float32)

    if input_type != 'dataframe':
        cu_score = cuml_trustworthiness(X, X_embedded)
    else:
        gdf = _as_gdf(X)
        gdf_embedded = _as_gdf(X_embedded)
        cu_score = cuml_trustworthiness(gdf, gdf_embedded)

    sk_score = sklearn_trustworthiness(X, X_embedded)

    eps = 0.0001
    assert sk_score * (1 - eps) <= cu_score <= sk_score * (1 + eps)
def _local_umap_trustworthiness(local_X, local_y, n_neighbors, supervised):
    """
    Fit a single-GPU UMAP on the full dataset, embed that same dataset and
    return the trustworthiness of the embedding.

    Labels are passed to ``fit`` only when ``supervised`` is truthy.
    """
    from cuml.manifold import UMAP

    model = UMAP(n_neighbors=n_neighbors, random_state=42)
    targets = local_y if supervised else None
    model.fit(local_X, y=targets)

    projected = model.transform(local_X)
    return trustworthiness(local_X, projected,
                           n_neighbors=n_neighbors, batch_size=5000)
def reduce_dimensionality(self, embeddings):
    """Project embeddings to 5 dimensions with UMAP.

    A cosine-distance k-nearest-neighbours graph (k=15) is computed first
    and handed to UMAP through ``knn_graph``.

    Args:
        embeddings (cupy.ndarray): The extracted embeddings using the
        sentence transformer module.

    Returns:
        umap_embeddings: The reduced embeddings
    """
    # The same neighbour count is used for the graph and for UMAP.
    n_neighbors = 15

    cosine_knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    cosine_knn.fit(embeddings)
    cosine_graph = cosine_knn.kneighbors_graph(embeddings, mode="distance")

    reducer = UMAP(n_neighbors=n_neighbors, n_components=5, min_dist=0.0)
    return reducer.fit_transform(embeddings, knn_graph=cosine_graph)
def test_internals_api(n_components):
    """Exercise the UMAP callback hooks.

    First fits with a regular callback and runs its self-check, then
    verifies that fitting with a callback built with ``skip_init=True``
    raises ``ValueError``.
    """
    good_cb = CustomCallback()
    fitted = UMAP(n_components=n_components, callback=good_cb)
    fitted.fit(data)
    good_cb.check()

    # Make sure super().__init__ is called
    bad_cb = CustomCallback(skip_init=True)
    broken = UMAP(n_epochs=10, callback=bad_cb)
    with pytest.raises(ValueError):
        broken.fit_transform(data)
def _umap_mnmg_trustworthiness(local_X, local_y, n_neighbors,
                               supervised, n_parts, sampling_ratio):
    """
    Fit a local UMAP on a random subset of the rows, run the transform
    through the distributed wrapper over every row, and return the
    trustworthiness of the resulting embedding.
    """
    import dask.array as da
    from cuml.dask.manifold import UMAP as MNMG_UMAP
    from cuml.manifold import UMAP

    local_model = UMAP(n_neighbors=n_neighbors, random_state=42)

    n_rows = local_X.shape[0]
    rows_per_chunk = math.ceil(n_rows / n_parts)

    # Seeded per-row True/False draw; True marks a training row.
    rng = np.random.RandomState(42)
    train_mask = rng.choice([True, False], n_rows, replace=True,
                            p=[sampling_ratio, 1.0 - sampling_ratio])

    X_train = local_X[train_mask]
    # All rows are transformed, split into row-wise chunks.
    X_all_d = da.from_array(local_X, chunks=(rows_per_chunk, -1))

    y_train = local_y[train_mask] if supervised else None
    local_model.fit(X_train, y=y_train)

    dist_model = MNMG_UMAP(local_model)
    embedding = dist_model.transform(X_all_d).compute()

    return trustworthiness(local_X, embedding,
                           n_neighbors=n_neighbors, batch_size=5000)
def umap_mnmg_trustworthiness():
    """Fit UMAP on a random sample, transform the held-out rows in parallel
    and return the trustworthiness of that embedding.

    Reads ``local_X``, ``local_y``, ``sampling_ratio``, ``n_parts``,
    ``n_neighbors`` and ``supervised`` from the enclosing scope.

    The train/held-out split uses a boolean mask. The earlier form applied
    ``~`` to an array of integer indices, which is a bitwise NOT
    (``~i == -(i + 1)``) and therefore selected mirrored rows rather than
    the complement of the training set.
    """
    n_samples = local_X.shape[0]
    # Never request more distinct rows than exist.
    n_sampling = min(int(n_samples * sampling_ratio), n_samples)
    n_samples_per_part = int(n_samples / n_parts)

    local_model = UMAP(n_neighbors=n_neighbors)

    # Draw distinct row indices, then turn them into a mask so that
    # ``~selection`` really is "every row that was not sampled".
    train_rows = np.random.choice(n_samples, n_sampling, replace=False)
    selection = np.zeros(n_samples, dtype=bool)
    selection[train_rows] = True

    X_train = local_X[selection]
    X_transform = local_X[~selection]
    X_transform_d = da.from_array(X_transform,
                                  chunks=(n_samples_per_part, -1))

    y_train = None
    if supervised:
        y_train = local_y[selection]

    local_model.fit(X_train, y=y_train)

    distributed_model = MNMG_UMAP(local_model)
    embedding = distributed_model.transform(X_transform_d)
    # Bring the computed result to host memory before scoring.
    embedding = cp.asnumpy(embedding.compute())

    return trustworthiness(X_transform, embedding, n_neighbors)
def _build_mnmg_umap(m, data, args, tmpdir):
    """Fit a local UMAP model and wrap it in the distributed class ``m``.

    Parameters
    ----------
    m : callable
        Called as ``m(client=..., model=..., **args)`` to build the result.
    data : tuple, list or single collection
        ``(X, y)``, ``(X, None)``, ``(X,)`` or a bare ``X``. Every non-None
        element is materialised with ``.compute()``.
    args : dict
        Keyword arguments for ``UMAP``. Must contain ``'client'``, which is
        removed from the dict in place (callers see the mutation).
    tmpdir : unused
        Accepted for signature compatibility.

    Raises
    ------
    KeyError
        If ``args`` has no ``'client'`` entry.
    ValueError
        If ``data`` does not yield exactly one or two arrays.
    """
    # pop() keeps the in-place removal of 'client' from the caller's dict.
    client = args.pop('client')
    local_model = UMAP(**args)

    if isinstance(data, (tuple, list)):
        local_data = [x.compute() for x in data if x is not None]
    else:
        # A bare collection previously left local_data unbound.
        local_data = [data.compute()]

    if len(local_data) == 2:
        X, y = local_data
        local_model.fit(X, y)
    elif len(local_data) == 1:
        # Pass the array itself, not the one-element list wrapping it.
        local_model.fit(local_data[0])
    else:
        raise ValueError(
            "expected data to provide X or (X, y), got "
            f"{len(local_data)} arrays"
        )

    return m(client=client, model=local_model, **args)
def local_umap_trustworthiness():
    """Fit UMAP on ``local_X`` with ``local_y``, embed ``local_X`` and
    return the trustworthiness of the embedding.

    ``local_X``, ``local_y`` and ``n_neighbors`` come from the enclosing
    scope.
    """
    model = UMAP(n_neighbors=n_neighbors)
    model.fit(local_X, local_y)

    projected = model.transform(local_X)
    return trustworthiness(local_X, projected, n_neighbors)
def test_internals_api(n_components):
    """Fit UMAP with a callback attached, then run the callback's check."""
    hook = CustomCallback()
    model = UMAP(n_components=n_components, callback=hook)
    model.fit(data)
    hook.check()
from cuml.manifold import UMAP
from sklearn.datasets import load_digits
from numba import cuda
from pyrr import Matrix44
import numpy as np
import cudatashader as ds
from cudatashader import transfer_functions as tf
from cudatashader.colors import Hot
from IPython.core.display import display, HTML, clear_output

# Load the digits dataset; the labels are kept as per-sample class ids.
digits = load_digits()
data, target_classes = digits.data, digits.target
n_samples = target_classes.shape[0]

# Fit a 3-component UMAP on the data and embed the same data.
reducer = UMAP(n_components=3)
reducer.fit(data)
embedding = reducer.transform(data)

# Device limit on threads per block, read from the current CUDA device.
maxThreadsPerBlock = cuda.get_current_device().MAX_THREADS_PER_BLOCK


# Kernel: copies each thread's class id into column 2 of `result`.
# The signature fixes `target_classes` as 1-D int64 and `result` as 2-D
# float64.
# NOTE(review): there is no bounds check on `i`; presumably the launch
# configuration must not exceed the number of rows -- confirm at call site.
@cuda.jit('void(int64[:], float64[:,:])')
def fill_agg_value(target_classes, result):
    i = cuda.grid(1)  # this thread's index in the 1-D launch grid
    result[i, 2] = target_classes[i]


# Kernel: takes three 2-D float64 arrays (`MVP`, `embedding`, `result`).
# NOTE(review): only the coordinate unpacking is visible in this excerpt;
# `MVP` and `result` are not used in the lines shown, so the rest of the
# body appears to be cut off.
@cuda.jit('void(float64[:,:], float64[:,:], float64[:,:])')
def apply_projection(MVP, embedding, result):
    i = cuda.grid(1)  # this thread's index in the 1-D launch grid
    # First three columns of row i of the embedding.
    x, y, z = embedding[i, 0], embedding[i, 1], embedding[i, 2]
data = digits.data
target = digits.target


class CustomCallback(GraphBasedDimRedCallback):
    """Callback that records which hooks were invoked during a fit."""

    # Class-level defaults; instances shadow them once a hook fires.
    preprocess_event = False
    epoch_event = 0
    train_event = False

    def check(self):
        """Assert that every hook fired and more than 10 epochs were seen."""
        assert self.preprocess_event
        assert self.epoch_event > 10
        assert self.train_event

    def on_preprocess_end(self, embeddings):
        """Mark that the preprocess-end hook was called."""
        self.preprocess_event = True

    def on_epoch_end(self, embeddings):
        """Count one more epoch-end call."""
        self.epoch_event = self.epoch_event + 1

    def on_train_end(self, embeddings):
        """Mark that the train-end hook was called."""
        self.train_event = True


reducer = UMAP(n_components=2, callback=CustomCallback())


@pytest.mark.parametrize('n_components', [2, 4, 8])
def test_internals_api(n_components):
    """Fit UMAP with a fresh callback and run the callback's self-check."""
    hook = CustomCallback()
    model = UMAP(n_components=n_components, callback=hook)
    model.fit(data)
    hook.check()