Пример #1
0
def test_rf_host_memory_leak(large_clf, estimator_type):
    """Check that refitting a random forest does not keep growing host memory.

    The model is fitted once to establish a baseline resident set size (RSS)
    and then refitted several times. The RSS measured after the last refit
    must stay within 2e6 bytes of the baseline.

    Parameters
    ----------
    large_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    estimator_type : str
        ``'classification'`` selects ``curfc``; any other value selects
        ``curfr``.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    this_process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    forest_kwargs = dict(max_depth=10, n_estimators=100, seed=123)
    if estimator_type != 'classification':
        base_model = curfr(**forest_kwargs)
        y = y.astype(np.float32)
    else:
        base_model = curfc(**forest_kwargs)
        y = y.astype(np.int32)

    # The first fit is the baseline: later fits should not use
    # significantly more host memory than this one did
    base_model.fit(X, y)
    gc.collect()
    baseline_rss = this_process.memory_info().rss

    n_refits = 5
    for _ in range(n_refits):
        base_model.fit(X, y)
        gc.collect()
        latest_rss = this_process.memory_info().rss

    # Some tiny allocations may occur, but we should not leak
    # without bounds, which previously happened
    growth = latest_rss - baseline_rss
    assert growth < 2e6
Пример #2
0
    def test_cuml_classifier(self):
        """
        Check that a wrapped cuML random forest can be used with
        classification_report and that the result reports as fitted
        """
        # NOTE: this is currently untested as I wasn't able to install cuML
        dataset_options = dict(
            n_samples=400,
            n_features=10,
            n_informative=2,
            n_redundant=3,
            n_classes=2,
            n_clusters_per_class=2,
            random_state=8311982,
        )
        features, labels = make_classification(**dataset_options)
        train_X, test_X, train_y, test_y = tts(features, labels)

        # Move every split onto the GPU as cudf objects
        train_X = cudf.DataFrame(train_X)
        train_y = cudf.Series(train_y)
        test_X = cudf.DataFrame(test_X)
        test_y = cudf.Series(test_y)

        forest = curfc(n_estimators=40, max_depth=8, max_features=1)
        wrapped = classifier(forest)
        visualizer = classification_report(
            wrapped, train_X, train_y, test_X, test_y, show=False
        )
        assert is_fitted(visualizer)
Пример #3
0
def test_rf_classification_default(datatype, column_info, nrows):
    """Compare a default-constructed cuML random forest classifier
    against its own CPU predictions and against scikit-learn.

    Parameters
    ----------
    datatype : dtype
        dtype the feature matrix is cast to.
    column_info : tuple
        ``(ncols, n_info)``: total and informative feature counts.
    nrows : int
        Number of generated samples.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=0,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # cuML random forest with default parameters: fit once, then predict
    # through both the "GPU" and the "CPU" predict_model paths
    cuml_model = curfc()
    cuml_model.fit(X_train, y_train)
    gpu_labels = cuml_model.predict(X_test, predict_model="GPU")
    cpu_labels = cuml_model.predict(X_test, predict_model="CPU")
    fil_acc = accuracy_score(y_test, gpu_labels)
    cu_acc = accuracy_score(y_test, cpu_labels)

    # The scikit-learn reference is only trained for smaller inputs
    if nrows < 500000:
        sk_model = skrfc(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cu_acc - 0.02)
Пример #4
0
def test_cuml_rf_classifier(n_classes):
    """Expect a RuntimeError when explaining a cuML random forest
    classifier with TreeExplainer.

    Parameters
    ----------
    n_classes : int
        Number of classes in the generated dataset.
    """
    n_samples = 100
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=8,
        n_informative=8,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        random_state=2021,
    )
    features = features.astype(np.float32)
    labels = labels.astype(np.float32)

    forest_params = dict(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfc(**forest_params)
    cuml_model.fit(features, labels)

    # cuML RF classifier is not supported yet, so either building the
    # explainer or asking it for SHAP values is expected to raise
    with pytest.raises(RuntimeError):
        explainer = TreeExplainer(model=cuml_model)
        explainer.shap_values(features)
Пример #5
0
def test_accuracy(nrows, ncols, n_info, datatype):
    """Check that cuML's accuracy score agrees with scikit-learn's
    accuracy score on the same random forest predictions.

    Parameters
    ----------
    nrows, ncols, n_info : int
        Sample count, feature count and informative feature count of the
        generated dataset.
    datatype : dtype
        dtype the feature matrices are cast to.
    """
    # First 80% of the rows are used for training, the rest for testing
    n_train = np.int32(nrows * 0.8)
    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )
    X_test = np.asarray(data[n_train:, :]).astype(datatype)
    y_test = np.asarray(target[n_train:]).astype(np.int32)
    X_train = np.asarray(data[:n_train, :]).astype(datatype)
    y_train = np.asarray(target[:n_train]).astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(True)

    # Train the cuML random forest classifier and predict the test rows
    forest_params = dict(
        max_features=1.0,
        n_bins=8,
        split_algo=0,
        split_criterion=0,
        min_rows_per_node=2,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=-1,
    )
    cuml_model = curfc(**forest_params)
    cuml_model.fit(X_train, y_train)
    predictions = cuml_model.predict(X_test)

    # Score the same predictions with both implementations
    cu_acc = cu_acc_score(y_test, predictions)
    cu_acc_using_sk = sk_acc_score(y_test, predictions)
    assert array_equal(cu_acc, cu_acc_using_sk)
Пример #6
0
def test_create_classification_model(max_features,
                                     max_depth, n_estimators, n_bins):
    """Check that parameters read with get_params() survive a round trip
    through set_params() on a second classifier.
    """
    # Source model built from the parametrized values
    source_model = curfc(max_features=max_features,
                         n_bins=n_bins,
                         n_estimators=n_estimators,
                         max_depth=max_depth)
    params = source_model.get_params()

    # Copy the parameters onto a default-constructed model and read them back
    target_model = curfc()
    target_model.set_params(**params)
    round_tripped = target_model.get_params()

    for key in ('max_features', 'max_depth', 'n_estimators', 'n_bins'):
        assert params[key] == round_tripped[key]
Пример #7
0
def test_rf_classification_seed(datatype, column_info, nrows):
    """Check that two random forests trained with the same seed agree.

    For each of 20 randomly drawn seeds, two classifiers are trained on the
    same data with that seed. Their "GPU" and "CPU" predictions, and the
    accuracies derived from them, must be identical.

    Parameters
    ----------
    datatype : dtype
        dtype the feature matrix is cast to.
    column_info : tuple
        ``(ncols, n_info)``: total and informative feature counts.
    nrows : int
        Number of generated samples.
    """
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=0,
                               n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    def fit_and_predict(seed):
        """Train a fresh classifier with ``seed``; return (fil, cpu) preds."""
        model = curfc(seed=seed, n_streams=1)
        model.fit(X_train, y_train)

        # predict using FIL, then with the CPU path
        fil_preds = model.predict(X_test, predict_model="GPU")
        cu_preds = model.predict(X_test, predict_model="CPU")
        # Give both prediction arrays the same shape before comparing
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        return fil_preds, cu_preds

    for _ in range(20):
        # The bounds must be ints: random.randint rejects float bounds such
        # as 1e5 on Python 3.12+ (and warns about them since 3.10)
        seed = random.randint(100, 100000)

        fil_preds_orig, cu_preds_orig = fit_and_predict(seed)
        fil_preds_rerun, cu_preds_rerun = fit_and_predict(seed)

        cu_acc_orig = accuracy_score(y_test, cu_preds_orig)
        fil_acc_orig = accuracy_score(y_test, fil_preds_orig)
        cu_acc_rerun = accuracy_score(y_test, cu_preds_rerun)
        fil_acc_rerun = accuracy_score(y_test, fil_preds_rerun)

        assert fil_acc_orig == fil_acc_rerun
        assert cu_acc_orig == cu_acc_rerun
        assert (fil_preds_orig == fil_preds_rerun).all()
        assert (cu_preds_orig == cu_preds_rerun).all()
Пример #8
0
def test_concat_memory_leak(large_clf, estimator_type):
    """Check that repeatedly concatenating treelite handles does not leak
    host memory.

    Ten forests are trained. The handles of nine of them are concatenated
    into the first one once as a warm-up, and then ten more times. The
    resident set size (RSS) after those repetitions must stay within 1e6
    bytes of the RSS measured after the warm-up.

    Parameters
    ----------
    large_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    estimator_type : str
        ``'classification'`` or ``'regression'``; any other value fails
        the test with an AssertionError.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Build a series of RF models
    n_models = 10
    if estimator_type == 'classification':
        base_models = [
            curfc(max_depth=10, n_estimators=100, random_state=123)
            for _ in range(n_models)
        ]
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        base_models = [
            curfr(max_depth=10, n_estimators=100, random_state=123)
            for _ in range(n_models)
        ]
        y = y.astype(np.float32)
    else:
        # Raise explicitly: a bare ``assert False`` is removed under
        # ``python -O`` and carries no diagnostic message
        raise AssertionError(
            "unexpected estimator_type: %r" % (estimator_type,))

    # Fit every model once before any memory measurement is taken
    for model in base_models:
        model.fit(X, y)

    # Warm-up concatenation: done once before the baseline is recorded
    concat_models = base_models[1:]
    init_model = base_models[0]
    other_handles = [
        model._obtain_treelite_handle() for model in concat_models
    ]
    init_model._concatenate_treelite_handle(other_handles)

    gc.collect()
    initial_baseline_mem = process.memory_info().rss

    # Just concatenate over and over in a loop
    for i in range(10):
        init_model._concatenate_treelite_handle(other_handles)
        gc.collect()
        used_mem = process.memory_info().rss
        logger.debug("memory at rep %2d: %d m" %
                     (i, (used_mem - initial_baseline_mem) / 1e6))

    gc.collect()
    used_mem = process.memory_info().rss
    logger.info("Final memory delta: %d" %
                ((used_mem - initial_baseline_mem) / 1e6))
    assert (used_mem - initial_baseline_mem) < 1e6
Пример #9
0
def test_rf_classification(small_clf, datatype, split_algo,
                           max_samples, max_features,
                           use_experimental_backend):
    """Train a cuML random forest classifier, check what ``fit`` prints
    about the experimental backend, and compare prediction accuracy with
    the CPU path and with scikit-learn.

    Parameters
    ----------
    small_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    datatype : dtype
        dtype the feature matrix is cast to.
    split_algo, max_samples, max_features
        Forwarded to ``curfc`` (``max_features`` also to scikit-learn).
    use_experimental_backend : bool
        Forwarded to ``curfc``; also selects which stdout messages are
        expected from ``fit``.
    """
    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    forest_params = dict(
        max_features=max_features, max_samples=max_samples,
        n_bins=16, split_algo=split_algo, split_criterion=0,
        min_samples_leaf=2, random_state=123, n_streams=1,
        n_estimators=40, handle=handle, max_leaves=-1,
        max_depth=16,
        use_experimental_backend=use_experimental_backend,
    )
    cuml_model = curfc(**forest_params)

    # Capture whatever fit() writes to Python's stdout
    stdout_buffer = io.StringIO()
    with redirect_stdout(stdout_buffer):
        cuml_model.fit(X_train, y_train)
    captured_stdout = stdout_buffer.getvalue()

    if not use_experimental_backend:
        # Nothing should be printed without the experimental backend
        assert captured_stdout == ''
    elif split_algo != 1:
        # Any split_algo other than 1 is expected to trigger the fallback
        assert ('Experimental backend does not yet support histogram '
                'split algorithm' in captured_stdout)
        assert ('Not using the experimental backend due to above '
                'mentioned reason(s)' in captured_stdout)
    else:
        assert ('Using experimental backend for growing trees'
                in captured_stdout)

    gpu_predict_options = dict(predict_model="GPU", output_class=True,
                               threshold=0.5, algo='auto')
    fil_preds = cuml_model.predict(X_test, **gpu_predict_options)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    # Give both prediction arrays the same shape before scoring
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    # The scikit-learn reference is only trained for smaller inputs
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
Пример #10
0
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Train a cuML random forest classifier on numpy or cudf inputs and
    check its labels and probabilities against each other and scikit-learn.

    Parameters
    ----------
    datatype : sequence
        ``datatype[0]`` is the dtype of the full feature matrix,
        ``datatype[1]`` the dtype the test split is cast to.
    array_type : str
        ``'dataframe'`` feeds cudf objects to the model; any other value
        feeds the arrays from the split directly.
    max_features, max_samples
        Forwarded to ``curfc`` (``max_features`` also to scikit-learn).
    fixture : tuple
        ``(X, y)`` dataset.
    """
    X, y = fixture
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )

    # Choose the model inputs and how its outputs are brought back to host
    if array_type == 'dataframe':
        fit_X = cudf.DataFrame(X_train)
        fit_y = cudf.Series(y_train)
        query_X = cudf.DataFrame(X_test)

        def proba_to_host(proba):
            return np.array(proba.as_gpu_matrix())

        def labels_to_host(labels):
            return labels.to_array()
    else:
        fit_X, fit_y, query_X = X_train, y_train, X_test

        def proba_to_host(proba):
            return proba

        def labels_to_host(labels):
            return labels

    cuml_model.fit(fit_X, fit_y)
    cu_proba_gpu = proba_to_host(cuml_model.predict_proba(query_X))
    cu_preds_cpu = labels_to_host(
        cuml_model.predict(query_X, predict_model="CPU"))
    cu_preds_gpu = labels_to_host(
        cuml_model.predict(query_X, predict_model="GPU"))

    # GPU labels must match the most probable class of predict_proba
    np.testing.assert_array_equal(cu_preds_gpu,
                                  np.argmax(cu_proba_gpu, axis=1))

    cu_acc_cpu = accuracy_score(y_test, cu_preds_cpu)
    cu_acc_gpu = accuracy_score(y_test, cu_preds_gpu)
    assert cu_acc_cpu == pytest.approx(cu_acc_gpu, abs=0.01, rel=0.1)

    # The scikit-learn reference is only trained for smaller inputs
    if y.size < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        sk_proba = sk_model.predict_proba(X_test)
        assert cu_acc_cpu >= sk_acc - 0.07
        assert cu_acc_gpu >= sk_acc - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)
Пример #11
0
def test_rf_classification_seed(small_clf, datatype):
    """Check that two random forests trained with the same seed agree.

    For each of 8 randomly drawn seeds, two classifiers are trained on the
    same data with that ``random_state``. Their "GPU" and "CPU" predictions,
    and the accuracies derived from them, must be identical.

    Parameters
    ----------
    small_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    datatype : dtype
        dtype the feature matrix is cast to.
    """
    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    def fit_and_predict(seed):
        """Train a fresh classifier with ``seed``; return (fil, cpu) preds."""
        model = curfc(random_state=seed, n_streams=1)
        model.fit(X_train, y_train)

        # predict using FIL, then with the CPU path
        fil_preds = model.predict(X_test, predict_model="GPU")
        cu_preds = model.predict(X_test, predict_model="CPU")
        # Give both prediction arrays the same shape before comparing
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        return fil_preds, cu_preds

    for _ in range(8):
        # The bounds must be ints: random.randint rejects float bounds such
        # as 1e5 on Python 3.12+ (and warns about them since 3.10)
        seed = random.randint(100, 100000)

        fil_preds_orig, cu_preds_orig = fit_and_predict(seed)
        fil_preds_rerun, cu_preds_rerun = fit_and_predict(seed)

        cu_acc_orig = accuracy_score(y_test, cu_preds_orig)
        fil_acc_orig = accuracy_score(y_test, fil_preds_orig)
        cu_acc_rerun = accuracy_score(y_test, cu_preds_rerun)
        fil_acc_rerun = accuracy_score(y_test, fil_preds_rerun)

        assert fil_acc_orig == fil_acc_rerun
        assert cu_acc_orig == cu_acc_rerun
        assert (fil_preds_orig == fil_preds_rerun).all()
        assert (cu_preds_orig == cu_preds_rerun).all()
Пример #12
0
    def _setup(self, config):
        """Build the cuML classifier and stage the train/test data on self.

        ``config`` is read with ``.get``: ``"estimators"`` (default 40) sets
        ``n_estimators`` and ``"depth"`` (default 16) sets ``max_depth``.
        The data splits are fetched with ``get_pinned_object(data_id)``,
        where ``data_id`` comes from an enclosing scope.
        """
        X_train, X_test, y_train, y_test = get_pinned_object(data_id)

        n_estimators = config.get("estimators", 40)
        max_depth = config.get("depth", 16)
        self.cuml_model = curfc(n_estimators=n_estimators,
                                max_depth=max_depth,
                                max_features=1.0)

        # Features become cudf DataFrames; training labels a cudf Series
        self.X_cudf_train = cudf.DataFrame.from_pandas(X_train)
        self.X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        self.y_cudf_train = cudf.Series(y_train.values)
        # Test labels are stored as received
        self.y_test = y_test
Пример #13
0
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    """Compare the mean squared error of cuML's predict_proba with that of
    scikit-learn's predict_proba on a two-class problem.

    Parameters
    ----------
    datatype : dtype
        dtype the feature matrix is cast to.
    split_algo, rows_sample, max_features
        Forwarded to ``curfc`` (``max_features`` also to scikit-learn).
    nrows : int
        Number of generated samples.
    column_info : tuple
        ``(ncols, n_info)``: total and informative feature counts.
    """
    ncols, n_info = column_info

    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classification model
    forest_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model = curfc(**forest_params)
    cuml_model.fit(X_train, y_train)
    fil_preds_proba = cuml_model.predict_proba(
        X_test, output_class=True, threshold=0.5, algo='auto'
    )

    # Reference probabilities: column 1 holds the true label and
    # column 0 its complement
    y_proba = np.zeros(np.shape(fil_preds_proba))
    y_proba[:, 1] = y_test
    y_proba[:, 0] = 1.0 - y_test
    fil_mse = mean_squared_error(y_proba, fil_preds_proba)

    # The scikit-learn reference is only trained for smaller inputs
    if nrows < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_mse = mean_squared_error(y_proba, sk_model.predict_proba(X_test))
        # Max difference of 0.0061 is seen between the mse values of
        # predict proba function of fil and sklearn
        assert fil_mse <= (sk_mse + 0.0061)
Пример #14
0
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    """Compare the accuracy of cuML random forest "GPU" predictions with
    its own "CPU" predictions and with scikit-learn.

    Parameters
    ----------
    datatype : dtype
        dtype the feature matrix is cast to.
    split_algo, rows_sample, max_features
        Forwarded to ``curfc`` (``max_features`` also to scikit-learn).
    nrows : int
        Number of generated samples.
    column_info : tuple
        ``(ncols, n_info)``: total and informative feature counts.
    """
    ncols, n_info = column_info

    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classification model
    forest_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model = curfc(**forest_params)
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='BATCH_TREE_REORG',
    )
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cuml_acc = accuracy_score(y_test, cpu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    # The scikit-learn reference is only trained for smaller inputs
    if nrows < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
Пример #15
0
def test_rf_nbins_small(small_clf):
    """Fit a default cuML random forest classifier on only the first three
    training rows; the test passes when ``fit`` does not raise.
    """
    features, labels = small_clf
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # Default-constructed cuML random forest classification model,
    # trained on a three-row slice of the training split
    n_rows_used = 3
    cuml_model = curfc()
    cuml_model.fit(X_train[:n_rows_used, :], y_train[:n_rows_used])
Пример #16
0
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Compare the accuracy of cuML random forest "GPU" predictions with
    its own "CPU" predictions and with scikit-learn.

    Parameters
    ----------
    small_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    datatype : dtype
        dtype the feature matrix is cast to.
    max_samples, max_features
        Forwarded to ``curfc`` (``max_features`` also to scikit-learn).
    """
    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classification model
    forest_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model = curfc(**forest_params)
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    # Give both prediction arrays the same shape before scoring
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    # The scikit-learn reference is only trained for smaller inputs
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    # Tolerance to be changed to 0.02, see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    cpu_tolerance = 0.07
    assert fil_acc >= (cuml_acc - cpu_tolerance)
Пример #17
0
def test_rf_memory_leakage(fil_sparse_format, column_info, nrows):
    """Check that repeated "GPU" predictions do not change the amount of
    free device memory relative to the value recorded after a warm-up fit.

    Parameters
    ----------
    fil_sparse_format
        Parametrized value; not read by this test body.
    column_info : tuple
        ``(ncols, n_info)``: total and informative feature counts.
    nrows : int
        Number of generated samples.
    """
    n_iter = 30
    ncols, n_info = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=0,
        n_classes=2,
    )
    data = data.astype(np.float32)
    target = target.astype(np.int32)
    X_train, _X_test, y_train, _y_test = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    def free_device_memory():
        # First entry of get_memory_info() is used as the free-memory figure
        return cuda.current_context().get_memory_info()[0]

    # Warmup. Some modules that are used in RF allocate space on the device
    # and consume memory. This is to make sure that the allocation is done
    # before the first call to get_memory_info.
    base_model = curfc(handle=handle)
    base_model.fit(X_train, y_train)
    free_mem = free_device_memory()

    rfc_model = curfc(handle=handle)
    rfc_model.fit(X_train, y_train)

    # Memory consumed since the warm-up; this value is overwritten inside
    # the loop below before it is ever asserted on
    delta_mem = free_mem - free_device_memory()

    cuml_mods = curfc(handle=handle)
    cuml_mods.fit(X_train, y_train)

    for _ in range(n_iter):
        cuml_mods.predict(X_train, predict_model="GPU")
        handle.sync()
        delta_mem = free_mem - free_device_memory()
        assert delta_mem == 0
Пример #18
0
def test_rf_multiclass_classifier_gtil_integration(tmpdir):
    """Round-trip a cuML random forest classifier through a treelite
    checkpoint and check that treelite's GTIL prediction matches the
    model's own ``predict_proba`` to 5 decimals.
    """
    features, labels = load_iris(return_X_y=True)
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    forest = curfc(max_depth=3, random_state=0, n_estimators=10)
    forest.fit(features, labels)
    expected_prob = forest.predict_proba(features)

    # Serialize the forest to a checkpoint file inside tmpdir
    checkpoint_path = os.path.join(tmpdir, 'checkpoint.tl')
    treelite_model = forest.convert_to_treelite_model()
    treelite_model.to_treelite_checkpoint(checkpoint_path)

    # Load it back with treelite and predict with GTIL
    restored = treelite.Model.deserialize(checkpoint_path)
    out_prob = treelite.gtil.predict(restored, features, pred_margin=True)
    np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
Пример #19
0
def test_rf_printing(capfd, n_estimators, detailed_printing):
    """Check that printing a trained forest emits one ``Tree #`` line per
    estimator.

    Parameters
    ----------
    capfd
        pytest fixture used to read the captured output.
    n_estimators : int
        Number of trees to train and expect in the printout.
    detailed_printing : bool
        Truthy calls ``print_detailed``; otherwise ``print_summary``.
    """
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # cuML Random Forest classification model
    forest_params = dict(
        handle=handle,
        max_features=1.0,
        rows_sample=1.0,
        n_bins=16,
        split_algo=0,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model = curfc(**forest_params)

    # Train model on the data
    cuml_model.fit(features, labels)

    if not detailed_printing:
        cuml_model.print_summary()
    else:
        cuml_model.print_detailed()

    # Read the captured output
    printed_output = capfd.readouterr().out

    # Test 1: something was printed
    assert printed_output != ''

    # Test 2: one "Tree #" line was printed per estimator
    tree_count = sum(
        1
        for line in printed_output.split('\n')
        if line.strip().startswith('Tree #')
    )
    assert n_estimators == tree_count
Пример #20
0
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Check CPU accuracy against scikit-learn and exercise "GPU"
    prediction for mixed train/test dtypes.

    Parameters
    ----------
    small_clf : tuple
        ``(X, y)`` pair supplied by a fixture.
    datatype : sequence
        ``datatype[0]`` is the dtype of the full feature matrix,
        ``datatype[1]`` the dtype the test split is cast to.
    convert_dtype : bool
        Forwarded to the "GPU" ``predict`` call.
    """
    train_dtype = datatype[0]
    test_dtype = datatype[1]

    X, y = small_clf
    X = X.astype(train_dtype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    X_test = X_test.astype(test_dtype)

    # Default cuML random forest classifier, scored on its CPU predictions
    cuml_model = curfc()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_acc = accuracy_score(y_test, cu_preds)

    # The scikit-learn reference is only trained for smaller inputs
    if X.shape[0] < 500000:
        sk_model = skrfc(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert cu_acc >= (sk_acc - 0.07)

    gpu_predict_options = dict(predict_model="GPU",
                               convert_dtype=convert_dtype)

    # predict using cuML's GPU based prediction
    if train_dtype == np.float32 and convert_dtype:
        fil_preds = cuml_model.predict(X_test, **gpu_predict_options)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

        fil_acc = accuracy_score(y_test, fil_preds)
        # Tolerance to be changed to 0.02, see issue #3910:
        # https://github.com/rapidsai/cuml/issues/3910
        assert fil_acc >= (cu_acc - 0.07)
    # if GPU predict cannot be used, display warning and use CPU predict
    elif test_dtype == np.float64:
        expected_warning = ("GPU based predict only accepts "
                            "np.float32 data. The model was "
                            "trained on np.float64 data hence "
                            "cannot use GPU-based prediction! "
                            "\nDefaulting to CPU-based Prediction. "
                            "\nTo predict on float-64 data, set "
                            "parameter predict_model = 'CPU'")
        with warnings.catch_warnings(record=True) as recorded:
            warnings.simplefilter("always")
            fil_preds = cuml_model.predict(X_test, **gpu_predict_options)
            # The most recently recorded warning must carry the message
            assert expected_warning in str(recorded[-1].message)
Пример #21
0
def test_rf_get_text(n_estimators, detailed_text):
    """Check that the text dump of a fitted forest lists every tree.

    A cuML random forest classifier is fitted on a small synthetic binary
    problem, then either its detailed or its summary text is requested,
    depending on ``detailed_text``.  The text must be non-empty and must
    contain exactly ``n_estimators`` lines that start with ``"Tree #"``.
    """
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    # The model is bound to an explicitly created handle
    handle, _ = get_handle(True, n_streams=1)

    # Hyper-parameters of the cuML Random Forest classification model
    rf_params = dict(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model = curfc(**rf_params)
    cuml_model.fit(features, labels)

    dump = (
        cuml_model.get_detailed_text()
        if detailed_text
        else cuml_model.get_summary_text()
    )

    # Test 1: the dump must not be empty
    assert dump != ""

    # Test 2: one "Tree #" header line per estimator
    tree_count = sum(
        1 for row in dump.split("\n") if row.strip().startswith("Tree #")
    )
    assert n_estimators == tree_count
Пример #22
0
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    """Compare cuML random forest accuracy across predictors and sklearn.

    The ``small_clf`` data is split 80/20, a cuML classifier is fitted and
    asked for predictions with both ``predict_model="GPU"`` and
    ``predict_model="CPU"``.  The GPU accuracy must stay within 0.02 of the
    CPU accuracy and, for datasets under 500000 rows, within 0.07 of a
    scikit-learn random forest trained on the same split.
    """
    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # The cuML model always runs with an explicit handle in this test
    handle, _ = get_handle(True, n_streams=1)

    # cuML random forest classification model: build and fit
    cuml_model = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X_train, y_train)

    # GPU prediction first, then CPU prediction as the reference
    gpu_preds = cuml_model.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    # Bring the GPU output to the same shape as the CPU output
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_acc = accuracy_score(y_test, cpu_preds)
    gpu_acc = accuracy_score(y_test, gpu_preds)

    # The sklearn baseline is only trained for datasets under 500000 rows
    if features.shape[0] < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert gpu_acc >= (sk_acc - 0.07)

    assert gpu_acc >= (cpu_acc - 0.02)
Пример #23
0
    def test_for_memory_leak():
        """Assert that fit and repeated GPU predict leave memory unchanged.

        ``free_mem`` from the enclosing scope is the reference amount; after
        the fit and after each of two GPU predictions the first entry of
        ``get_memory_info()`` on the current CUDA context must still equal
        it.
        """

        def _memory_delta():
            # Difference between the reference free memory and the value
            # currently reported by the CUDA context
            return free_mem - cuda.current_context().get_memory_info()[0]

        model = curfc(handle=handle)
        model.fit(X_train, y_train)
        handle.sync()  # make sure the fit has completed before measuring
        assert _memory_delta() == 0

        for _ in range(2):
            model.predict(X_test, predict_model="GPU",
                          fil_sparse_format=fil_sparse_format)
            handle.sync()  # make sure the predict has completed
            assert _memory_delta() == 0
Пример #24
0
def test_rf_classification_float64(datatype, column_info, nrows,
                                   convert_dtype):
    """Exercise CPU and GPU prediction with mixed train/test dtypes.

    ``datatype`` is indexed as ``[0]`` for the training dtype and ``[1]``
    for the test dtype; ``column_info`` unpacks to the number of columns
    and of informative columns.  CPU accuracy is compared with
    scikit-learn (tolerance 0.07) when ``nrows`` is below 500000.  GPU
    prediction is expected to work only when the model was trained on
    ``np.float32`` and ``convert_dtype`` is truthy (tolerance 0.02 against
    the CPU accuracy); in every other case it must raise ``TypeError``.
    """
    ncols, n_info = column_info
    train_dtype = datatype[0]

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=0,
        n_classes=2,
    )
    X = X.astype(train_dtype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    # The test split may use a different dtype than the training split
    X_test = X_test.astype(datatype[1])

    # cuML random forest classifier with default parameters
    cuml_model = curfc()
    cuml_model.fit(X_train, y_train)
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_acc = accuracy_score(y_test, cpu_preds)

    # scikit-learn reference, only for datasets under 500000 rows
    if nrows < 500000:
        sk_model = skrfc(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert cpu_acc >= (sk_acc - 0.07)

    gpu_predict_expected = train_dtype == np.float32 and convert_dtype
    if not gpu_predict_expected:
        # GPU prediction must be rejected for this dtype/flag combination
        with pytest.raises(TypeError):
            cuml_model.predict(X_test,
                               predict_model="GPU",
                               convert_dtype=convert_dtype)
        return

    # GPU prediction is available: its accuracy must track the CPU result
    gpu_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   convert_dtype=convert_dtype)
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_acc = accuracy_score(y_test, gpu_preds)
    assert gpu_acc >= (cpu_acc - 0.02)
Пример #25
0
def test_multiple_fits_classification(large_clf, n_estimators, n_bins):
    """Check that fitting the same model twice keeps its parameters.

    After two consecutive ``fit`` calls on the ``large_clf`` data,
    ``get_params()`` must still report the ``n_estimators`` and ``n_bins``
    the model was constructed with.
    """
    features, labels = large_clf
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)
    cuml_model = curfc(n_bins=n_bins, n_estimators=n_estimators, max_depth=10)

    # Fit the very same estimator twice in a row
    for _ in range(2):
        cuml_model.fit(features, labels)

    # The constructor arguments must have survived both fits
    params = cuml_model.get_params()
    expected = {'n_estimators': n_estimators, 'n_bins': n_bins}
    for key, value in expected.items():
        assert params[key] == value
Пример #26
0
def test_rf_predict_numpy(datatype, use_handle, split_algo, n_info, nrows,
                          ncols):
    """Compare default cuML prediction accuracy with scikit-learn.

    A 5-class synthetic dataset is split so that the first 80% of the rows
    are used for training and the rest for testing.  For datasets under
    500000 rows the cuML accuracy must be within 0.07 of the accuracy of a
    scikit-learn random forest trained on the same data.
    """
    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # First 80% of the rows train the model, the remainder tests it
    split_at = np.int32(nrows * 0.8)
    X_train = np.asarray(X[0:split_at, :]).astype(datatype)
    y_train = np.asarray(y[0:split_at, ]).astype(np.int32)
    X_test = np.asarray(X[split_at:, 0:]).astype(datatype)
    y_test = np.asarray(y[split_at:, ]).astype(np.int32)

    # Handle for the cuML model
    handle, _ = get_handle(use_handle)

    # cuML random forest classification model: build, fit, predict
    cuml_model = curfc(
        max_features=1.0,
        n_bins=8,
        split_algo=split_algo,
        min_rows_per_node=2,
        n_estimators=30,
        handle=handle,
        max_leaves=-1,
    )
    cuml_model.fit(X_train, y_train)
    cu_acc = accuracy_score(y_test, cuml_model.predict(X_test))

    if nrows < 500000:
        # scikit-learn random forest used as the accuracy baseline
        sk_model = skrfc(
            n_estimators=40,
            max_depth=None,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))

        # cuML may trail scikit-learn by at most 0.07
        assert cu_acc >= (sk_acc - 0.07)
Пример #27
0
def test_cuml_rf_classifier(n_classes, input_type):
    """Check that SHAP values of a cuML forest sum to its predictions.

    A cuML random forest classifier is fitted on 100 synthetic samples
    supplied as NumPy, CuPy (``input_type == 'cupy'``) or cuDF
    (``input_type == 'cudf'``) containers.  The SHAP values returned by
    ``TreeExplainer`` summed over their last axis, plus the explainer's
    expected value, must match ``predict_proba`` to 4 decimals.
    """
    n_samples = 100
    X, y = make_classification(
        n_samples=n_samples,
        n_features=8,
        n_informative=8,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        random_state=2021,
    )
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    # Move the data into the requested container type
    if input_type == 'cupy':
        X, y = cp.array(X), cp.array(y)
    elif input_type == 'cudf':
        X, y = cudf.DataFrame(X), cudf.Series(y)

    cuml_model = curfc(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model.fit(X, y)
    pred = cuml_model.predict_proba(X)

    explainer = TreeExplainer(model=cuml_model)
    out = explainer.shap_values(X)

    # Convert the outputs of the 'cupy' and 'cudf' runs for the NumPy checks
    if input_type == 'cupy':
        pred = pred.get()
    elif input_type == 'cudf':
        pred = pred.to_numpy()
    expected_value = explainer.expected_value
    if input_type in ('cupy', 'cudf'):
        out = out.get()
        expected_value = expected_value.get()

    # SHAP values should add up to predicted score
    baseline = np.tile(expected_value.reshape(-1, 1), (1, n_samples))
    shap_sum = np.sum(out, axis=2) + baseline
    pred = np.transpose(pred, (1, 0))
    np.testing.assert_almost_equal(shap_sum, pred, decimal=4)
    def _train(self):
        """Train one random forest on the GPU and report its score and timings.

        The dataset held in ``self._dataset`` is split 80/20 (shuffled,
        ``random_state=42``) with ``self._y_label`` as the target column.
        A model is built from ``self._model_params``, fitted and scored on
        the held-out part.  If the score beats
        ``self._global_best_test_accuracy``, both that value and
        ``self._global_best_model`` are updated.

        Returns
        -------
        dict
            ``test_accuracy`` (the score), ``train_time`` and ``infer_time``
            (timer durations rounded to 4 decimals) and ``is_bad`` (True
            when the score is not finite).

        Raises
        ------
        NotImplementedError
            If the ``compute`` setting is anything other than ``"GPU"``.
        """
        if compute != "GPU":
            # Only the GPU path creates the data split and the model.  Any
            # other setting used to fall through and crash on unbound local
            # variables, so fail early with an explicit message instead.
            raise NotImplementedError(
                f"_train only supports compute == 'GPU', got {compute!r}"
            )

        # split data
        features = self._dataset.loc[
            :, self._dataset.columns != self._y_label
        ]
        target = self._dataset[self._y_label]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            target,
            train_size=0.8,
            shuffle=True,
            random_state=42,
        )
        self.rf_model = curfc(
            n_estimators=self._model_params["n_estimators"],
            max_depth=self._model_params["max_depth"],
            n_bins=self._model_params["n_bins"],
            max_features=self._model_params["max_features"],
        )
        X_train = X_train.astype('float32')
        X_test = X_test.astype('float32')

        # train model
        with PerfTimer() as train_timer:
            trained_model = self.rf_model.fit(X_train,
                                              y_train.astype("float32"))
        training_time = train_timer.duration

        # evaluate perf
        # NOTE(review): the score is computed with r2_score although the key
        # is named "test_accuracy" -- confirm this metric is intended.
        with PerfTimer() as inference_timer:
            test_accuracy = r2_score(y_test.astype("float32"),
                                     trained_model.predict(X_test))
        infer_time = inference_timer.duration

        # update best model [ assumes maximization of perf metric ]
        if test_accuracy > self._global_best_test_accuracy:
            self._global_best_test_accuracy = test_accuracy
            self._global_best_model = trained_model

        return {
            "test_accuracy": test_accuracy,
            "train_time": round(training_time, 4),
            "infer_time": round(infer_time, 4),
            "is_bad": not math.isfinite(test_accuracy),
        }
Пример #29
0
def test_rf_nbins_small(small_clf):
    """Check the warning emitted when fitting on only three samples.

    A default cuML random forest classifier is fitted on the first three
    rows of the training split; the last recorded warning must contain the
    message announcing that ``n_bins`` is changed to the number of
    training samples.
    """
    X, y = small_clf
    X = X.astype(np.float32)
    y = y.astype(np.int32)
    # Only the training part of the split is used by this test
    X_train, _, y_train, _ = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # cuML random forest classification model with default parameters
    cuml_model = curfc()

    expected_message = (
        "The number of bins, `n_bins` is greater than "
        "the number of samples used for training. "
        "Changing `n_bins` to number of training samples."
    )

    # Record every warning raised while fitting on just three samples
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cuml_model.fit(X_train[0:3, :], y_train[0:3])

    assert expected_message in str(caught[-1].message)
Пример #30
0
def test_rf_classification(datatype, split_algo,
                           n_info, nrows, ncols,
                           max_depth, rows_sample):
    """Compare cuML random forest accuracy with scikit-learn.

    The combination ``split_algo == 1`` with a negative ``max_depth`` is
    marked as an expected failure.  Otherwise a 5-class synthetic dataset
    is split 80/20 by row order, a cuML classifier is fitted, and for
    datasets under 500000 rows with ``max_depth > 1`` its accuracy must be
    within 0.07 of a scikit-learn forest trained on the same data.
    """
    if split_algo == 1 and max_depth < 0:
        pytest.xfail("Unlimited depth not supported with quantile")

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # First 80% of the rows train the model, the remainder tests it
    split_at = np.int32(nrows*0.8)
    X_train = np.asarray(X[0:split_at, :]).astype(datatype)
    y_train = np.asarray(y[0:split_at, ]).astype(np.int32)
    X_test = np.asarray(X[split_at:, 0:]).astype(datatype)
    y_test = np.asarray(y[split_at:, ]).astype(np.int32)

    # The cuML model always runs with an explicit handle in this test
    handle, _ = get_handle(True)

    # cuML random forest classification model: build, fit, predict
    cuml_model = curfc(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=max_depth,
    )
    cuml_model.fit(X_train, y_train)
    cu_acc = accuracy_score(y_test, cuml_model.predict(X_test))

    if nrows < 500000:
        # scikit-learn has no depth limit when max_depth is None
        sk_depth = max_depth if max_depth > 0 else None
        sk_model = skrfc(
            n_estimators=40,
            max_depth=sk_depth,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))

        # The accuracy comparison is only made for max_depth above 1
        if max_depth > 1:
            assert cu_acc >= (sk_acc - 0.07)