Example #1
def test_n_init_cluster_consistency(random_state):

    nclusters = 8
    X, y = get_data_consistency_test()

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    initial_clusters = cuml_kmeans.cluster_centers_

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    assert array_equal(initial_clusters, cuml_kmeans.cluster_centers_)
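These tests come from a pytest suite, where arguments such as random_state are injected by fixtures or parametrization not shown in the listing. A minimal sketch of how that wiring might look (the parameter values are illustrative, not taken from the original suite):

import pytest

@pytest.fixture(params=[0, 42, 1000])
def random_state(request):
    # Each test requesting `random_state` runs once per value listed above.
    return request.param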
Example #2
def test_n_init_cluster_consistency(random_state):

    cluster_std = 1.0

    nrows = 100000
    ncols = 100
    nclusters = 8

    X, y = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    initial_clusters = cuml_kmeans.cluster_centers_

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    assert array_equal(initial_clusters, cuml_kmeans.cluster_centers_)
Example #3
def test_score(nrows, ncols, nclusters):

    X, y = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=0.01,
                      random_state=10)

    cuml_kmeans = cuml.KMeans(verbose=1,
                              init="k-means||",
                              n_clusters=nclusters,
                              random_state=10)

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)

    predictions = cuml_kmeans.predict(X)

    centers = cp.array(cuml_kmeans.cluster_centers_.as_gpu_matrix())

    expected_score = 0
    for idx, label in enumerate(predictions):

        x = X[idx]
        y = centers[label]

        dist = np.sqrt(np.sum((x - y)**2))
        expected_score += dist**2

    assert actual_score + SCORE_EPS \
        >= (-1*expected_score) \
        >= actual_score - SCORE_EPS
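KMeans.score follows the scikit-learn convention of returning the negative inertia (the sum of squared distances from each sample to its closest centroid), which is why the assertion brackets -1 * expected_score. SCORE_EPS is a module-level tolerance constant not shown in this listing; an assumed stand-in:

# Assumed tolerance constant; the exact value in the original module may differ
SCORE_EPS = 0.06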
Example #4
def test_rand_index_score(name, nrows):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    cu_score = cu_ars(y, cu_y_pred)
    cu_score_using_sk = sk_ars(y, cu_y_pred)

    assert array_equal(cu_score, cu_score_using_sk)
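cu_ars and sk_ars are not defined in this snippet; judging by the aliases and the assertion, they are presumably the cuML and scikit-learn implementations of the adjusted Rand index. Assumed imports:

# Assumed imports, inferred from the aliases used above
from cuml.metrics import adjusted_rand_score as cu_ars
from sklearn.metrics import adjusted_rand_score as sk_ars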
Example #5
def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0
    np.random.seed(random_state)

    # Draw integer per-sample weights in [1, max_weight)
    wt = np.random.randint(1, high=max_weight, size=nrows)

    X, y = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X, sample_weight=wt)
    cu_score = cuml_kmeans.score(X)

    sk_kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
    sk_score = sk_kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
Example #6
def test_kmeans_sequential_plus_plus_init(nrows, ncols, nclusters,
                                          random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, y = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    # copy_to_host() (numba device-array API) copies X back to NumPy for sklearn
    kmeans.fit(X.copy_to_host())
    sk_score = kmeans.score(X.copy_to_host())

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
Example #7
def test_kmeans_sklearn_comparison_default(name, nrows):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'],
                              random_state=12,
                              n_init=10,
                              output_type='numpy')

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)
    kmeans = cluster.KMeans(random_state=12, n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2
Example #8
def test_score(nrows, ncols, nclusters):

    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=10,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)

    predictions = cuml_kmeans.predict(X)

    centers = cuml_kmeans.cluster_centers_

    expected_score = 0
    for idx, label in enumerate(predictions):
        x = X[idx]
        y = cp.array(centers[label])

        dist = cp.sqrt(cp.sum((x - y)**2))
        expected_score += dist**2

    assert actual_score + SCORE_EPS \
        >= (-1*expected_score) \
        >= actual_score - SCORE_EPS
Example #9
def test_traditional_kmeans_plus_plus_init(nrows, ncols, nclusters,
                                           random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    kmeans.fit(cp.asnumpy(X))
    sk_score = kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
Example #10
def test_kmeans_sklearn_comparison_default(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)
    kmeans = cluster.KMeans(random_state=12, n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    # cuML score should be in a close neighborhood around scikit-learn's
    assert sk_score - 0.03 <= cu_score <= sk_score + 0.03
Example #11
def test_score(nrows, ncols, nclusters, random_state):

    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)

    centers = cuml_kmeans.cluster_centers_

    expected_score = 0.0
    for idx, label in enumerate(predictions):
        x = X[idx, :]
        y = cp.array(centers[label, :], dtype=cp.float32)

        sq_euc_dist = cp.sum(cp.square((x - y)))
        expected_score += sq_euc_dist

    expected_score *= -1

    cp.testing.assert_allclose(actual_score,
                               expected_score,
                               atol=0.1,
                               rtol=1e-5)
Example #12
    def re_cluster(self, gdf, new_figerprints=None, new_chembl_ids=None):
        if gdf.shape[0] == 0:
            return None

        # Before reclustering remove all columns that may interfere
        ids = gdf['id']
        chembl_ids = gdf['chembl_id']

        gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], axis=1, inplace=True)
        if new_figerprints is not None and new_chembl_ids is not None:
            # Add new fingerprints and ChEMBL IDs before reclustering
            if self.pca:
                new_figerprints = self.pca.transform(new_figerprints)

            new_index = list(range(self.orig_df.shape[0],
                                   self.orig_df.shape[0] + len(new_figerprints)))
            if self.enable_gpu:
                fp_df = cudf.DataFrame(new_figerprints,
                                       index=new_index,
                                       columns=gdf.columns)
            else:
                fp_df = pandas.DataFrame(new_figerprints,
                                         index=new_index,
                                         columns=gdf.columns)

            gdf = gdf.append(fp_df, ignore_index=True)
            # Update original dataframe for it to work on reload
            fp_df['id'] = fp_df.index
            self.orig_df = self.orig_df.append(fp_df, ignore_index=True)
            chembl_ids = chembl_ids.append(
                cudf.Series(new_chembl_ids), ignore_index=True)
            ids = ids.append(fp_df['id'], ignore_index=True)
            self.chembl_ids.extend(new_chembl_ids)

            del fp_df

        if self.enable_gpu:
            kmeans_float = cuml.KMeans(n_clusters=self.n_clusters)
        else:
            kmeans_float = sklearn.cluster.KMeans(n_clusters=self.n_clusters)

        kmeans_float.fit(gdf)
        Xt = self.umap.fit_transform(gdf)

        # Add back the columns required for plotting and for correlating
        # data between re-clusterings
        if self.enable_gpu:
            gdf.add_column('x', Xt[0].to_array())
            gdf.add_column('y', Xt[1].to_array())
            gdf.add_column('cluster', kmeans_float.labels_.to_gpu_array())
            gdf.add_column('chembl_id', chembl_ids)
            gdf.add_column('id', ids)
        else:
            gdf['x'] = Xt[:,0]
            gdf['y'] = Xt[:,1]
            gdf['cluster'] = kmeans_float.labels_
            gdf['chembl_id'] = chembl_ids
            gdf['id'] = ids

        return gdf
Example #13
def KMeans(data, cluster):
    warnings.filterwarnings('ignore')
    # Zero-copy handoff: torch tensor -> DLPack capsule -> cuDF DataFrame
    data_pack = torch.utils.dlpack.to_dlpack(data)
    data_df = cudf.from_dlpack(data_pack)
    model = cuml.KMeans(n_clusters=cluster)
    result = model.fit(data_df)
    # Hand the labels back to torch, again via DLPack
    labels = torch.utils.dlpack.from_dlpack(result.labels_.to_dlpack())
    warnings.filterwarnings('once')
    return labels
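A hypothetical call of this wrapper (assumes a CUDA-enabled PyTorch build; the shapes and cluster count are illustrative):

import torch

data = torch.rand(10000, 16, device='cuda')
labels = KMeans(data, cluster=8)  # torch tensor of per-row cluster ids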
Example #14
def test_kmeans_sklearn_comparison(name, nrows):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred = kmeans.fit_predict(X)

        # The noisy-circles cluster labels may be permuted between the two
        # implementations, so with only two clusters we just check that both
        # assign approximately the same number of points to each cluster.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3
        if name == 'noisy_circles':
            assert (calculation < 4e-3) and score_test

        else:
            if name == 'aniso':
                # For the aniso dataset, points on the frontier between
                # clusters tend to be assigned differently than in sklearn
                tol = 2e-2
            else:
                # Allow up to 5 points to differ for the other datasets, to
                # stay robust to small behavior changes between library
                # versions. Visually it is very clear that the algorithm
                # works; an option to plot may be added in a future version.
                tol = 1e-2
            assert (clusters_equal(
                sk_y_pred, cu_y_pred, params['n_clusters'],
                tol=tol)) and score_test
Example #15
    def _query8(self):
        self._loadTables('query8')

        train = {}
        for i in range(12):
            train['c{}'.format(i)] = np.random.rand(1000)
        train = cudf.DataFrame(train)
        kmeans = cuml.KMeans(n_clusters=8)
        kmeans.fit(train)

        rideIndex = self._createIndex(
            self.rideTable,
            'ride.start',
        )

        locationFilter = self.locationTable[
            self.locationTable['loc.locationId'] == 0]
        locationPolygon = self._createBox(
            locationFilter,
            'loc.bounds',
        )

        startTime = time.time()

        (joinRide,
         numbaTime) = self._spatialJoinDist(self.rideTable, locationFilter,
                                            'ride.start', 'loc.bounds',
                                            rideIndex, locationPolygon, 0.0)

        featureName = [
            'ride.c0', 'ride.c1', 'ride.c2', 'ride.c3', 'ride.c4', 'ride.c5',
            'ride.c6', 'ride.c7', 'ride.c8', 'ride.c9', 'ride.c10', 'ride.c11'
        ]

        join0 = joinRide.merge(self.riderTable,
                               left_on='ride.riderId',
                               right_on='rider.riderId')
        groupRider = join0.groupby(['rider.riderId'], )[featureName].mean()

        groupRider['cluster'] = kmeans.predict(groupRider)

        endTime = time.time()

        groupRider.to_csv(
            'query8_gpu.csv',
            index=False,
        )
        return endTime - startTime - numbaTime
Example #16
def test_all_kmeans_params(n_rows, n_clusters, max_iter, init,
                           oversampling_factor, max_samples_per_batch):

    np.random.seed(0)
    X = np.random.rand(1000, 10)

    if init == 'preset':
        init = np.random.rand(n_clusters, 10)

    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters,
                              max_iter=max_iter,
                              init=init,
                              oversampling_factor=oversampling_factor,
                              max_samples_per_batch=max_samples_per_batch)

    cuml_kmeans.fit_predict(X)
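The argument grid for this test is supplied by pytest parametrization, which is not shown in the listing. A sketch of what it might look like (the names match the test signature; the values are illustrative, not from the original suite):

import pytest

@pytest.mark.parametrize('n_rows', [1000])
@pytest.mark.parametrize('n_clusters', [2, 8])
@pytest.mark.parametrize('max_iter', [100, 300])
@pytest.mark.parametrize('init', ['k-means||', 'random', 'preset'])
@pytest.mark.parametrize('oversampling_factor', [1.0, 2.0])
@pytest.mark.parametrize('max_samples_per_batch', [32768])
def test_all_kmeans_params_sketch(n_rows, n_clusters, max_iter, init,
                                  oversampling_factor,
                                  max_samples_per_batch):
    ...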
Example #17
def test_kmeans_clusters_blobs(nrows, ncols, nclusters,
                               random_state, cluster_std):

    X, y = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=random_state,)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    preds = cuml_kmeans.fit_predict(X)

    assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99
Example #18
def test_all_kmeans_params(n_clusters, max_iter, init, oversampling_factor,
                           max_samples_per_batch, random_state):

    np.random.seed(0)
    X = np.random.rand(1000, 10)

    if init == 'preset':
        init = np.random.rand(n_clusters, 10)

    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters,
                              max_iter=max_iter,
                              init=init,
                              random_state=random_state,
                              oversampling_factor=oversampling_factor,
                              max_samples_per_batch=max_samples_per_batch,
                              output_type='cupy')

    cuml_kmeans.fit_predict(X)
Example #19
def test_kmeans_sklearn_comparison(name):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 10000)

    params = default_base.copy()
    params.update(pat[1])

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    clustering_algorithms = (
        ('sk_Kmeans', kmeans),
        ('cuml_Kmeans', cuml_kmeans),
    )

    sk_y_pred, _ = fit_predict(clustering_algorithms[0][1],
                               clustering_algorithms[0][0], X)

    cu_y_pred, _ = fit_predict(clustering_algorithms[1][1],
                               clustering_algorithms[1][0], X)

    # The noisy-circles cluster labels may be permuted between the two
    # implementations, so with only two clusters we just check that both
    # assign approximately the same number of points to each cluster.
    if name == 'noisy_circles':
        assert (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred) < 2e-3

    else:
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
Example #20
def test_kmeans_sklearn_comparison(name, nrows):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # The noisy-circles cluster labels may be permuted between the two
        # implementations, so with only two clusters we just check that both
        # assign approximately the same number of points to each cluster.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
        print(cuml_kmeans.score(X), kmeans.score(X))
        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3
        if name == 'noisy_circles':
            assert (calculation < 2e-3) and score_test

        else:
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'])) and score_test
Example #21
import cuml
from cuml.test.utils import array_equal
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
import pickle
from sklearn.manifold import trustworthiness

regression_models = dict(LinearRegression=cuml.LinearRegression(),
                         Lasso=cuml.Lasso(),
                         Ridge=cuml.Ridge(),
                         ElasticNet=cuml.ElasticNet())

solver_models = dict(CD=cuml.CD(), SGD=cuml.SGD(eta0=0.005))

cluster_models = dict(KMeans=cuml.KMeans())

decomposition_models = dict(
    PCA=cuml.PCA(),
    TruncatedSVD=cuml.TruncatedSVD(),
)

decomposition_models_xfail = dict(
    GaussianRandomProjection=cuml.GaussianRandomProjection(),
    SparseRandomProjection=cuml.SparseRandomProjection())

neighbor_models = dict(NearestNeighbors=cuml.NearestNeighbors())

dbscan_model = dict(DBSCAN=cuml.DBSCAN())

umap_model = dict(UMAP=cuml.UMAP())
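These module-level registries group estimators for parameterized pickling tests. A minimal sketch of how such a registry is typically consumed (the helper is illustrative, not from the original module):

def pickle_roundtrip(model):
    # Serialize and restore the estimator; its attributes should survive.
    return pickle.loads(pickle.dumps(model))

restored = pickle_roundtrip(cluster_models['KMeans'])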
Example #22
regression_models = {
    "LinearRegression":
    lambda fit_intercept=True: cuml.LinearRegression(
        fit_intercept=fit_intercept),
    "Lasso":
    lambda fit_intercept=True: cuml.Lasso(fit_intercept=fit_intercept),
    "Ridge":
    lambda fit_intercept=True: cuml.Ridge(fit_intercept=fit_intercept),
    "ElasticNet":
    lambda fit_intercept=True: cuml.ElasticNet(fit_intercept=fit_intercept)
}

solver_models = {
    "CD": lambda: cuml.CD(),
    "SGD": lambda: cuml.SGD(eta0=0.005),
    "QN": lambda: cuml.QN(loss="softmax")
}

cluster_models = {"KMeans": lambda: cuml.KMeans()}

decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()}

dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()}
Example #23
        if enable_gpu:
            pca = cuml.PCA(n_components=pca_components)
        else:
            pca = sklearn.decomposition.PCA(n_components=pca_components)

        df_fingerprints = pca.fit_transform(df_fingerprints)
        print('Runtime PCA time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                         task_start_time))
    else:
        pca = False
        print('PCA has been skipped')

    task_start_time = datetime.now()
    n_clusters = 7
    if enable_gpu:
        kmeans_float = cuml.KMeans(n_clusters=n_clusters)
    else:
        kmeans_float = sklearn.cluster.KMeans(n_clusters=n_clusters)
    kmeans_float.fit(df_fingerprints)
    print('Runtime Kmeans time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                        task_start_time))

    # UMAP
    task_start_time = datetime.now()
    if enable_gpu:
        umap = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
    else:
        umap = umap.UMAP()

    Xt = umap.fit_transform(df_fingerprints)
    print('Runtime UMAP time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                      task_start_time))
Example #24
    def _cluster(self, embedding, n_pca):
        """
        Generates UMAP transformation on Kmeans labels generated from
        molecular fingerprints.
        """
        if hasattr(embedding, 'compute'):
            embedding = embedding.compute()

        embedding = embedding.reset_index()

        # Before reclustering remove all columns that may interfere
        embedding, prop_series = self._remove_non_numerics(embedding)
        self.n_molecules, n_obs = embedding.shape

        if self.context.is_benchmark:
            molecular_embedding_sample, spearman_index = self._random_sample_from_arrays(
                embedding, n_samples=self.n_spearman)

        if n_pca and n_obs > n_pca:
            with MetricsLogger('pca', self.n_molecules) as ml:
                if self.pca is None:
                    self.pca = cuml.PCA(n_components=n_pca)
                    self.pca.fit(embedding)
                embedding = self.pca.transform(embedding)

        with MetricsLogger('kmeans', self.n_molecules) as ml:
            if self.n_molecules < MIN_RECLUSTER_SIZE:
                raise Exception(
                    'Reclustering less than %d molecules is not supported.' %
                    MIN_RECLUSTER_SIZE)

            kmeans_cuml = cuml.KMeans(n_clusters=self.n_clusters)
            kmeans_cuml.fit(embedding)
            kmeans_labels = kmeans_cuml.predict(embedding)

            ml.metric_name = 'silhouette_score'
            ml.metric_func = batched_silhouette_scores
            ml.metric_func_kwargs = {}
            ml.metric_func_args = (None, None)

            if self.context.is_benchmark:
                (embedding_sample,
                 kmeans_labels_sample), _ = self._random_sample_from_arrays(
                     embedding, kmeans_labels, n_samples=self.n_silhouette)
                ml.metric_func_args = (embedding_sample, kmeans_labels_sample)

        with MetricsLogger('umap', self.n_molecules) as ml:
            umap = cuml.manifold.UMAP()
            Xt = umap.fit_transform(embedding)

            ml.metric_name = 'spearman_rho'
            ml.metric_func = self._compute_spearman_rho
            ml.metric_func_args = (None, None)
            if self.context.is_benchmark:
                X_train_sample, _ = self._random_sample_from_arrays(
                    embedding, index=spearman_index)
                ml.metric_func_args = (molecular_embedding_sample,
                                       X_train_sample)

        # Add back the columns required for plotting and for correlating
        # data between re-clusterings
        embedding['cluster'] = kmeans_labels
        embedding['x'] = Xt[0]
        embedding['y'] = Xt[1]

        # Add back the prop columns
        for col in prop_series.keys():
            embedding[col] = prop_series[col]

        return embedding
Example #25

regression_models = {
    "LinearRegression": lambda: cuml.LinearRegression(),
    "Lasso": lambda: cuml.Lasso(),
    "Ridge": lambda: cuml.Ridge(),
    "ElasticNet": lambda: cuml.ElasticNet()
}

solver_models = {
    "CD": lambda: cuml.CD(),
    "SGD": lambda: cuml.SGD(eta0=0.005)
}

cluster_models = {
    "KMeans": lambda: cuml.KMeans()
}

decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {
    "NearestNeighbors": lambda: cuml.NearestNeighbors()
}
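Unlike Example #21, which instantiates every model at import time, these registries store zero-argument factories, so each test can build a fresh, unfitted estimator. Illustrative usage:

# Each call to the factory yields a new, independent estimator
km_a = cluster_models['KMeans']()
km_b = cluster_models['KMeans']()
assert km_a is not km_b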