示例#1
0
def test_trustworthiness(input_type, n_samples, n_features, n_components):
    """Compare ``cuml_trustworthiness`` against ``sklearn_trustworthiness``.

    Blob data is generated, embedded with UMAP, and both the data and the
    embedding are cast to float32. When ``input_type`` is ``'dataframe'``
    the cuML score is computed from cuDF DataFrames built column by column;
    otherwise the arrays are passed directly. The two scores must agree
    within a relative tolerance of 1e-4.
    """
    def _as_gdf(matrix):
        # One float32 column per feature, named by its column index.
        frame = cudf.DataFrame()
        for col in range(matrix.shape[1]):
            frame[str(col)] = np.asarray(matrix[:, col], dtype=np.float32)
        return frame

    n_centers = round(n_samples * 0.4)
    X, y = make_blobs(n_samples=n_samples,
                      centers=n_centers,
                      n_features=n_features)

    # The embedding is fitted on the data as generated; the float32 casts
    # happen afterwards.
    reducer = UMAP(n_components=n_components)
    X_embedded = reducer.fit_transform(X)
    X = X.astype(np.float32)
    X_embedded = X_embedded.astype(np.float32)

    if input_type != 'dataframe':
        cu_score = cuml_trustworthiness(X, X_embedded)
    else:
        gdf = _as_gdf(X)
        gdf_embedded = _as_gdf(X_embedded)
        cu_score = cuml_trustworthiness(gdf, gdf_embedded)

    sk_score = sklearn_trustworthiness(X, X_embedded)

    tolerance = 0.0001
    assert (sk_score * (1 - tolerance)
            <= cu_score
            <= sk_score * (1 + tolerance))
示例#2
0
def _local_umap_trustworthiness(local_X, local_y, n_neighbors, supervised):
    """
    Fit one UMAP model on the whole data set, embed that same data and
    return the trustworthiness of the embedding.

    ``local_y`` is handed to ``fit`` as the target only when ``supervised``
    is truthy; otherwise the fit receives ``y=None``.
    """
    from cuml.manifold import UMAP

    targets = local_y if supervised else None

    umap_model = UMAP(n_neighbors=n_neighbors, random_state=42)
    umap_model.fit(local_X, y=targets)

    embedded = umap_model.transform(local_X)
    return trustworthiness(
        local_X,
        embedded,
        n_neighbors=n_neighbors,
        batch_size=5000,
    )
示例#3
0
    def reduce_dimensionality(self, embeddings):
        """Project the embeddings down to five dimensions with UMAP.

        A 15-nearest-neighbour graph using the cosine metric is computed
        first (in "distance" mode) and passed to UMAP as ``knn_graph``.

        Args:
            embeddings (cupy.ndarray): The extracted embeddings using the
            sentence transformer module.

        Returns:
            umap_embeddings: The reduced embeddings
        """
        # Cosine-distance neighbour graph over the input embeddings.
        cosine_nn = NearestNeighbors(n_neighbors=15, metric="cosine")
        cosine_nn.fit(embeddings)
        cosine_graph = cosine_nn.kneighbors_graph(embeddings, mode="distance")

        # Fit UMAP and embed in one step, reusing the graph built above.
        reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
        return reducer.fit_transform(embeddings, knn_graph=cosine_graph)
示例#4
0
def test_internals_api(n_components):
    """Exercise the UMAP callback hook.

    First fits with a regular ``CustomCallback`` and runs its ``check()``;
    then verifies that fitting with a callback built with
    ``skip_init=True`` raises ``ValueError``.
    """
    tracking_cb = CustomCallback()
    fitted = UMAP(n_components=n_components, callback=tracking_cb)
    fitted.fit(data)
    tracking_cb.check()

    # Make sure super().__init__ is called
    uninitialised_cb = CustomCallback(skip_init=True)
    broken_model = UMAP(n_epochs=10, callback=uninitialised_cb)
    with pytest.raises(ValueError):
        broken_model.fit_transform(data)
示例#5
0
def _umap_mnmg_trustworthiness(local_X, local_y, n_neighbors, supervised,
                               n_parts, sampling_ratio):
    """
    Fit UMAP on a random subset of the rows, embed every row through the
    distributed wrapper and return the trustworthiness of that embedding.

    Each row joins the training subset independently with probability
    ``sampling_ratio`` (RandomState seeded with 42). The rows to transform
    are wrapped in a dask array chunked into ``ceil(n_samples / n_parts)``
    rows. ``local_y`` is used as the fit target only when ``supervised``
    is truthy.
    """
    import dask.array as da
    from cuml.dask.manifold import UMAP as MNMG_UMAP

    from cuml.manifold import UMAP

    single_model = UMAP(n_neighbors=n_neighbors, random_state=42)

    row_count = local_X.shape[0]
    rows_per_chunk = math.ceil(row_count / n_parts)

    # Boolean mask marking the rows used for training.
    rng = np.random.RandomState(42)
    train_mask = rng.choice([True, False],
                            row_count,
                            replace=True,
                            p=[sampling_ratio, 1.0 - sampling_ratio])

    train_X = local_X[train_mask]
    all_rows_d = da.from_array(local_X, chunks=(rows_per_chunk, -1))
    train_y = local_y[train_mask] if supervised else None

    single_model.fit(train_X, y=train_y)

    # Wrap the fitted model so the transform runs over the dask chunks.
    distributed_model = MNMG_UMAP(single_model)
    embedded = distributed_model.transform(all_rows_d).compute()

    return trustworthiness(
        local_X,
        embedded,
        n_neighbors=n_neighbors,
        batch_size=5000,
    )
示例#6
0
        def umap_mnmg_trustworthiness():
            """Train UMAP on a random sample and score the held-out rows.

            ``int(n_samples * sampling_ratio)`` row indices are drawn (with
            replacement) for training. Every row that was *not* drawn is
            embedded through the distributed wrapper, and the
            trustworthiness of that embedding is returned. ``local_y`` is
            used as the fit target only when ``supervised`` is truthy.

            All inputs (``local_X``, ``local_y``, ``n_neighbors``,
            ``supervised``, ``n_parts``, ``sampling_ratio``) come from the
            enclosing scope.
            """
            n_samples = local_X.shape[0]
            n_sampling = int(n_samples * sampling_ratio)
            n_samples_per_part = int(n_samples / n_parts)

            local_model = UMAP(n_neighbors=n_neighbors)

            # Integer row indices of the training sample.
            selection = np.random.choice(n_samples, n_sampling)
            X_train = local_X[selection]

            # `selection` holds integer indices, so `~selection` is a
            # bitwise NOT (-(i + 1)) that picks mirrored rows, not the
            # complement. Build an explicit boolean hold-out mask instead.
            held_out = np.ones(n_samples, dtype=bool)
            held_out[selection] = False
            X_transform = local_X[held_out]
            X_transform_d = da.from_array(X_transform,
                                          chunks=(n_samples_per_part, -1))

            y_train = None
            if supervised:
                y_train = local_y[selection]

            local_model.fit(X_train, y=y_train)

            # Wrap the fitted model so the transform runs over dask chunks.
            distributed_model = MNMG_UMAP(local_model)
            embedding = distributed_model.transform(X_transform_d)

            embedding = cp.asnumpy(embedding.compute())
            return trustworthiness(X_transform, embedding, n_neighbors)
示例#7
0
def _build_mnmg_umap(m, data, args, tmpdir):
    """Fit a local UMAP model and wrap it with the distributed builder ``m``.

    Args:
        m: Callable invoked as ``m(client=..., model=..., **args)``.
        data: Either a tuple/list of objects exposing ``.compute()``
            (``None`` entries are dropped), giving ``(X, y)`` or ``(X,)``
            after filtering, or a single such object holding only ``X``.
        args: Keyword arguments for ``UMAP``. Must contain ``'client'``,
            which is removed from this dict in place before the remaining
            entries are forwarded.
        tmpdir: Not used by this function.

    Returns:
        The result of ``m(client=client, model=local_model, **args)``.

    Raises:
        KeyError: If ``args`` has no ``'client'`` entry.
        ValueError: If ``data`` does not yield one or two inputs.
    """
    # pop() keeps the original in-place removal of 'client' from `args`.
    client = args.pop('client')
    local_model = UMAP(**args)

    # A bare (non tuple/list) input used to leave `local_data` unbound;
    # treat it as a features-only input.
    if not isinstance(data, (tuple, list)):
        data = (data,)
    local_data = [x.compute() for x in data if x is not None]

    if len(local_data) == 2:
        X, y = local_data
        local_model.fit(X, y)
    elif len(local_data) == 1:
        # Unwrap the single input rather than passing a one-element list.
        local_model.fit(local_data[0])
    else:
        raise ValueError(
            "expected data to provide X or (X, y), got "
            f"{len(local_data)} non-None inputs")

    return m(client=client, model=local_model, **args)
示例#8
0
 def local_umap_trustworthiness():
     """Fit UMAP on ``local_X``/``local_y`` from the enclosing scope, embed
     ``local_X`` and return the trustworthiness of that embedding."""
     umap_model = UMAP(n_neighbors=n_neighbors)
     umap_model.fit(local_X, local_y)
     projected = umap_model.transform(local_X)
     score = trustworthiness(local_X, projected, n_neighbors)
     return score
示例#9
0
def test_internals_api(n_components):
    """Fit UMAP with a ``CustomCallback`` attached, then run the callback's
    own ``check()``."""
    tracking_cb = CustomCallback()
    model = UMAP(n_components=n_components, callback=tracking_cb)
    model.fit(data)
    tracking_cb.check()
示例#10
0
from cuml.manifold import UMAP
from sklearn.datasets import load_digits
from numba import cuda
from pyrr import Matrix44
import numpy as np
import cudatashader as ds
from cudatashader import transfer_functions as tf
from cudatashader.colors import Hot
from IPython.core.display import display, HTML, clear_output

# Load the digits data set and keep features and labels separately.
digits = load_digits()
data = digits.data
target_classes = digits.target
n_samples = target_classes.shape[0]

# Fit a three-component UMAP on the features, then embed the same features.
reducer = UMAP(n_components=3)
reducer.fit(data)
embedding = reducer.transform(data)

# Per-block thread limit reported by the current CUDA device.
maxThreadsPerBlock = cuda.get_current_device().MAX_THREADS_PER_BLOCK


@cuda.jit('void(int64[:], float64[:,:])')
def fill_agg_value(target_classes, result):
    # CUDA kernel: thread i copies target_classes[i] into column 2 of
    # row i of `result`.
    i = cuda.grid(1)
    # Guard against surplus threads: when the launch grid is rounded up to
    # whole blocks, indices past the end of either array must not be used.
    if i < target_classes.shape[0] and i < result.shape[0]:
        result[i, 2] = target_classes[i]


@cuda.jit('void(float64[:,:], float64[:,:], float64[:,:])')
def apply_projection(MVP, embedding, result):
    # CUDA kernel taking three 2-D float64 arrays (per the signature above).
    # NOTE(review): only the coordinate loads are visible here; `MVP` and
    # `result` are never used, so the body looks truncated -- confirm
    # against the full source before relying on this kernel.
    i = cuda.grid(1)  # absolute 1-D index of this thread in the grid
    # Read the first three components of row i of the embedding.
    x, y, z = embedding[i, 0], embedding[i, 1], embedding[i, 2]
示例#11
0
# Features and labels taken from the `digits` data set object.
data = digits.data
target = digits.target


class CustomCallback(GraphBasedDimRedCallback):
    """Callback that records which hooks were invoked during a fit.

    ``check()`` asserts that the preprocess and train hooks each ran and
    that the epoch hook ran more than ten times.
    """

    # Class-level defaults; the hooks below set per-instance values.
    preprocess_event = False
    epoch_event = 0
    train_event = False

    def on_preprocess_end(self, embeddings):
        """Record that the preprocess hook was called."""
        self.preprocess_event = True

    def on_epoch_end(self, embeddings):
        """Count one more call of the epoch hook."""
        self.epoch_event = self.epoch_event + 1

    def on_train_end(self, embeddings):
        """Record that the train hook was called."""
        self.train_event = True

    def check(self):
        """Assert that every hook fired as expected."""
        assert self.preprocess_event
        assert self.epoch_event > 10
        assert self.train_event


# Module-level two-component reducer with its own callback instance; the
# parametrised test below builds a separate reducer for each run.
reducer = UMAP(
    n_components=2,
    callback=CustomCallback(),
)


@pytest.mark.parametrize('n_components', [2, 4, 8])
def test_internals_api(n_components):
    """Fit UMAP with a fresh ``CustomCallback`` for each parametrised
    ``n_components`` and verify the hooks via the callback's ``check()``."""
    hook_recorder = CustomCallback()
    UMAP(n_components=n_components, callback=hook_recorder).fit(data)
    hook_recorder.check()