Example #1
def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
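    # Fit a multi-GPU cuml.dask KMeans on tight synthetic blobs and check
    # that the distributed score() agrees with the score of the combined
    # single-GPU model to within 1e-3.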

    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
        y = y_train
    elif input_type == "array":
        X_train, y_train = X, y

    cumlModel = cumlKMeans(init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    cumlModel.fit(X_train)

    actual_score = cumlModel.score(X_train)

    local_model = cumlModel.get_combined_model()
    expected_score = local_model.score(X_train.compute())

    assert abs(actual_score - expected_score) < 1e-3
Example #2
def test_predict_proba(dataset, datatype, n_neighbors,
                       n_parts, batch_size, client):
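    # Compare predict_proba() from the distributed KNN classifier (dKNNClf)
    # against the single-GPU baseline (lKNNClf) on the same train/test split.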
    X_train, X_test, y_train, y_test = dataset

    l_model = lKNNClf(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_probas = l_model.predict_proba(X_test)

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    d_model = dKNNClf(client=client, n_neighbors=n_neighbors)
    d_model.fit(X_train, y_train)
    d_probas = d_model.predict_proba(X_test, convert_dtype=True)
    d_probas = da.compute(d_probas)[0]

    if datatype == 'dask_cudf':
        d_probas = list(map(lambda o: o.as_matrix()
                            if isinstance(o, DataFrame)
                            else o.to_array()[..., np.newaxis],
                            d_probas))

    check_probabilities(l_probas, d_probas)
Example #3
def test_extract_partitions_shape(nrows, ncols, n_parts, input_type,
                                  colocated, client):
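    # Verify that DistributedDataHandler preserves per-partition row counts,
    # both for colocated (X, y) pairs and for X alone.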
    adj_input_type = 'dataframe' if input_type == 'series' else input_type

    X_arr, y_arr = make_blobs(n_samples=nrows, n_features=ncols,
                              n_parts=n_parts)

    # adj_input_type is "dataframe" for both dataframe and series inputs
    if adj_input_type == "dataframe":
        X = to_dask_cudf(X_arr)
        y = to_dask_cudf(y_arr)
    elif input_type == "array":
        X, y = X_arr, y_arr

    if input_type == "series":
        X = X[X.columns[0]]

    if input_type == "dataframe" or input_type == "series":
        X_len_parts = X.map_partitions(len).compute()
        y_len_parts = y.map_partitions(len).compute()
    elif input_type == "array":
        X_len_parts = X.chunks[0]
        y_len_parts = y.chunks[0]

    if colocated:
        ddh = DistributedDataHandler.create((X, y), client)
        parts = [part.result() for worker, part in ddh.gpu_futures]
        for i in range(len(parts)):
            assert (parts[i][0].shape[0] == X_len_parts[i]) and (
                    parts[i][1].shape[0] == y_len_parts[i])
    else:
        ddh = DistributedDataHandler.create(X, client)
        parts = [part.result() for worker, part in ddh.gpu_futures]
        for i in range(len(parts)):
            assert (parts[i].shape[0] == X_len_parts[i])
Example #4
def test_extract_partitions_worker_list(nrows, ncols, n_parts, input_type,
                                        colocated, cluster):
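    # One GPU future should be produced per input partition, whether or not
    # X and y are passed together (colocated).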
    client = Client(cluster)

    try:

        adj_input_type = 'dataframe' if input_type == 'series' else input_type

        X_arr, y_arr = make_blobs(n_samples=int(nrows),
                                  n_features=ncols,
                                  n_parts=n_parts)

        # adj_input_type is "dataframe" for both dataframe and series inputs
        if adj_input_type == "dataframe":
            X = to_dask_cudf(X_arr)
            y = to_dask_cudf(y_arr)
        elif input_type == "array":
            X, y = X_arr, y_arr

        if input_type == "series":
            X = X[X.columns[0]]

        if colocated:
            ddh = DistributedDataHandler.create((X, y), client)
        else:
            ddh = DistributedDataHandler.create(X, client)

        parts = list(map(lambda x: x[1], ddh.gpu_futures))
        assert len(parts) == n_parts
    finally:
        client.close()
Example #5
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):
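    # transform() should return per-cluster distances of shape
    # (nrows, nclusters); taking the argmin recovers the cluster labels.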

    client = None

    try:

        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick way to handle (nrows,) vs. (nrows, 1)
        xformed_labels = cp.argmin(xformed.reshape((int(nrows),
                                                    int(nclusters))), axis=1)

        # tight, unshuffled blobs should give a perfect adjusted rand score
        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels)) == 1.0

    finally:
        if client is not None:
            client.close()
Example #6
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
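    # End-to-end fit/predict for distributed KMeans, exercising both eager
    # and dask-delayed prediction paths; well-separated blobs should yield
    # a perfect adjusted rand score.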

    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    cumlModel = cumlKMeans(init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    cumlModel.fit(X_train)
    cumlLabels = cumlModel.predict(X_train, delayed=delayed_predict)

    n_workers = len(list(client.has_what().keys()))

    # Verifying we are grouping partitions. This should be changed soon.
    if n_parts is not None:
        parts_len = n_parts
    else:
        parts_len = n_workers

    if input_type == "dataframe":
        assert cumlLabels.npartitions == parts_len
        cumlPred = cumlLabels.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(cumlLabels.chunks[0]) == parts_len
        cumlPred = cp.array(cumlLabels.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)

    print(score)

    assert score == 1.0
Example #7
def test_predict_and_score(dataset, datatype, n_neighbors, n_parts, batch_size,
                           client):
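    # Distributed KNN regressor: predictions, indices, and distances must
    # match the local baseline exactly, and R^2 scores must agree to ~1e-2.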
    X_train, X_test, y_train, y_test = dataset
    np_y_test = y_test

    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_distances, l_indices = l_model.kneighbors(X_test)
    l_outputs = l_model.predict(X_test)
    local_out = (l_outputs, l_indices, l_distances)
    handmade_local_score = r2_score(y_test, l_outputs)
    handmade_local_score = round(float(handmade_local_score), 3)

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNReg(client=client,
                      n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = \
        d_model.predict(X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)
    if datatype == 'dask_array':
        distributed_score = d_model.score(X_test, y_test)
        distributed_score = round(float(distributed_score), 3)

    if datatype == 'dask_cudf':
        distributed_out = list(
            map(
                lambda o: o.as_matrix()
                if isinstance(o, DataFrame) else o.to_array()[..., np.newaxis],
                distributed_out))

    exact_match(local_out, distributed_out)

    if datatype == 'dask_array':
        assert distributed_score == pytest.approx(handmade_local_score,
                                                  abs=1e-2)
    else:
        y_pred = distributed_out[0]
        handmade_distributed_score = float(r2_score(np_y_test, y_pred))
        handmade_distributed_score = round(handmade_distributed_score, 3)
        assert handmade_distributed_score == pytest.approx(
            handmade_local_score, abs=1e-2)
Example #8
File: test_pca.py, Project: teju85/cuml
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):
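    # Distributed PCA attributes (singular values, components, explained
    # variance) should match scikit-learn's full-SVD PCA within loose
    # tolerances.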

    client = Client(cluster)

    try:

        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_train)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        for attr in all_attr:
            with_sign = False if attr in ['components_'] else True
            cuml_res = getattr(cupca, attr)
            # numpy results need no conversion; np.ndarray has no as_matrix()
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
Example #9
def test_pca_fit(data_info, input_type, client):
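    # Despite its name, this test fits the distributed TruncatedSVD and
    # compares its attributes against scikit-learn's arpack TruncatedSVD,
    # scaling the problem down when GPU memory is limited.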

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = getattr(cutsvd, attr)
        # numpy results need no conversion; np.ndarray has no to_numpy()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Example #10
def test_pca_fit(data_info, input_type, cluster):
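    # Same TruncatedSVD-vs-scikit-learn comparison as above, but managing
    # the Dask client explicitly from a cluster fixture.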

    client = Client(cluster)
    nrows, ncols, n_parts = data_info

    try:

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        # compare attributes before tearing down the client
        for attr in all_attr:
            with_sign = False if attr in ['components_'] else True
            cuml_res = getattr(cutsvd, attr)
            skl_res = getattr(sktsvd, attr)
            if attr == 'singular_values_':
                assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
            else:
                assert array_equal(cuml_res, skl_res, 1e-1,
                                   with_sign=with_sign)
    finally:
        client.close()
Example #11
def _check_input_fit(self, X, is_categories=False):
    """Helper function to check input of fit within the multi-gpu model"""
    if isinstance(X, (dask.array.core.Array, cp.ndarray)):
        self._set_input_type('array')
        if is_categories:
            X = X.transpose()
        if isinstance(X, cp.ndarray):
            return DataFrame(X)
        else:
            return to_dask_cudf(X, client=self.client)
    else:
        self._set_input_type('df')
        return X
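The method above is a type-dispatch helper: Dask/CuPy arrays are tagged as
'array' input (transposed first when they hold categories) and converted to a
cudf-backed container, while dataframe inputs pass through unchanged. A
minimal standalone sketch of the same dispatch idea (illustrative only;
check_fit_input is a hypothetical name, not part of cuml's API):

import cupy as cp
import dask.array as da

def check_fit_input(X, is_categories=False):
    # Mirror the branches above: array inputs are (optionally) transposed
    # and tagged 'array'; everything else is treated as a dataframe.
    if isinstance(X, (da.Array, cp.ndarray)):
        if is_categories:
            X = X.T
        return 'array', X
    return 'df', X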
Example #12
def test_score(dataset, datatype, n_neighbors, n_parts, client):
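    # The distributed model's score() should equal a score computed manually
    # from its own predictions on the same test partitions.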
    X_train, X_test, y_train, y_test = dataset

    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNReg(client=client, n_neighbors=n_neighbors)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = \
        d_model.predict(X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if datatype == 'dask_cudf':
        distributed_out = list(
            map(
                lambda o: o.as_matrix()
                if isinstance(o, DataFrame) else o.to_array()[..., np.newaxis],
                distributed_out))
    cuml_score = d_model.score(X_test, y_test)

    if datatype == 'dask_cudf':
        y_test = y_test.compute().as_matrix()
    else:
        y_test = y_test.compute()
    manual_score = accuracy_score(y_test, distributed_out[0])

    assert cuml_score == manual_score
Example #13
def test_predict(dataset, datatype, n_neighbors, n_parts, batch_size, client):
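    # Distributed KNN regressor predictions should match the local baseline
    # (match_test), with a sanity check on prediction quality at the end.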
    X_train, X_test, y_train, y_test = dataset

    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_distances, l_indices = l_model.kneighbors(X_test)
    l_outputs = l_model.predict(X_test)
    local_out = (l_outputs, l_indices, l_distances)

    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    d_model = dKNNReg(client=client,
                      n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = \
        d_model.predict(X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if datatype == 'dask_cudf':
        distributed_out = list(
            map(
                lambda o: o.as_matrix()
                if isinstance(o, DataFrame) else o.to_array()[..., np.newaxis],
                distributed_out))

    match_test(local_out, distributed_out)
    assert accuracy_score(local_out[0], distributed_out[0]) > 0.12
Example #14
def test_predict_and_score(dataset, datatype, parameters, client):
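    # Variant of the predict-and-score test where all parameters arrive as a
    # single fixture tuple; outputs must match exactly and R^2 scores must
    # agree to ~1e-2.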
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset

    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_outputs = l_model.predict(X_test)
    handmade_local_score = r2_score(y_test, l_outputs)
    handmade_local_score = round(float(handmade_local_score), 3)

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNReg(client=client,
                      n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs = d_model.predict(X_test, convert_dtype=True)
    d_outputs = d_outputs.compute()

    d_outputs = d_outputs.to_numpy() \
        if isinstance(d_outputs, DataFrame) \
        else d_outputs

    exact_match(l_outputs, d_outputs)

    distributed_score = d_model.score(X_test, y_test)
    distributed_score = round(float(distributed_score), 3)
    assert distributed_score == pytest.approx(handmade_local_score, abs=1e-2)
Example #15
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):
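    # Recompute the KMeans inertia by hand from predictions and cluster
    # centers, then check it against score(), which returns negative inertia
    # (the scikit-learn convention), hence the sign flip in the final assert.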

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            y = y_train
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)

            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):

            x = X[idx]
            y = centers[label]

            dist = cp.sqrt(cp.sum((x - y)**2))
            expected_score += dist**2

        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        if client is not None:
            client.close()
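For reference, the per-point loop above can be collapsed into two vectorized
CuPy lines (a sketch, assuming X, centers, and predictions are CuPy arrays,
as in the "array" input path):

diffs = X - centers[predictions]     # residual of each point to its center
expected_score = cp.sum(diffs ** 2)  # total squared distance, i.e. inertia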