Пример #1
0
def _build_treelite_classifier(m, data, arg={}, tmpdir=None):
    """Compile a previously saved XGBoost model into a treelite predictor.

    The XGBoost model file is looked up inside ``tmpdir`` under a name built
    from ``arg["max_depth"]``, ``arg["num_rounds"]`` and the two leading
    dimensions of ``data[0]``. The loaded booster is converted to a treelite
    model, exported with gcc as a shared library next to the model file, and
    wrapped in a ``treelite.runtime.Predictor``.

    Parameters
    ----------
    m
        Not used by this function.
    data
        Sequence whose first element exposes ``.shape`` with at least two
        entries (rows, features).
    arg
        Mapping that must contain ``"max_depth"`` and ``"num_rounds"``.
    tmpdir
        Directory holding the saved model; it is passed straight to
        ``os.path.join``.

    Returns
    -------
    treelite.runtime.Predictor
        Predictor backed by the freshly exported shared library.

    Raises
    ------
    ImportError
        If treelite or XGBoost is reported as unavailable.
    """
    from cuml.utils.import_utils import has_treelite, has_xgboost

    # Fail early when an optional dependency is missing, then import it.
    if not has_treelite():
        raise ImportError("No treelite package found")
    import treelite
    import treelite.runtime

    if not has_xgboost():
        raise ImportError("No XGBoost package found")
    import xgboost as xgb

    depth = arg["max_depth"]
    rounds = arg["num_rounds"]
    feature_count = data[0].shape[1]
    row_count = data[0].shape[0]

    saved_model = os.path.join(
        tmpdir, f"xgb_{depth}_{rounds}_{feature_count}_{row_count}.model")
    # The shared library name is the model path with "treelite.so" appended.
    shared_lib = saved_model + "treelite.so"

    booster = xgb.Booster()
    booster.load_model(saved_model)

    compiled = treelite.Model.from_xgboost(booster)
    compiled.export_lib(toolchain="gcc",
                        libpath=shared_lib,
                        params={'parallel_comp': 40},
                        verbose=False)

    return treelite.runtime.Predictor(shared_lib, verbose=False)
Пример #2
0
def _treelite_format_hook(data):
    """Wrap the first element of ``data`` in a treelite runtime batch.

    Parameters
    ----------
    data
        Indexable with at least two elements; ``data[0]`` is handed to
        ``treelite.runtime.Batch.from_npy2d`` and ``data[1]`` is passed
        through untouched.

    Returns
    -------
    tuple
        ``(batch, data[1])``, where ``batch`` is the treelite batch built
        from ``data[0]``.

    Raises
    ------
    ImportError
        If treelite is reported as unavailable.
    """
    from cuml.utils.import_utils import has_treelite

    if not has_treelite():
        raise ImportError("No treelite package found")
    import treelite
    import treelite.runtime

    batch = treelite.runtime.Batch.from_npy2d(data[0])
    return batch, data[1]
Пример #3
0
def _build_treelite_classifier(m, data, arg={}):
    """Train an XGBoost model and compile it into a treelite predictor.

    A binary-logistic XGBoost booster is trained on at most the first
    100000 rows of ``data[0]`` / ``data[1]`` for ``arg["num_rounds"]``
    rounds, with every entry of ``arg`` merged into the training parameters.
    The booster is converted to a treelite model, exported with gcc as a
    shared library inside a directory created by ``tempfile.mkdtemp`` (this
    function does not remove that directory), and wrapped in a
    ``treelite.runtime.Predictor``.

    Parameters
    ----------
    m
        Not used by this function.
    data
        Indexable whose first element is a 2-D sliceable array of features
        and whose second element holds the matching labels.
    arg
        Mapping that must contain ``"max_depth"`` and ``"num_rounds"``.

    Returns
    -------
    treelite.runtime.Predictor
        Predictor backed by the freshly exported shared library.

    Raises
    ------
    ImportError
        If treelite or XGBoost is reported as unavailable.
    """
    from cuml.utils.import_utils import has_treelite, has_xgboost

    # Fail early when an optional dependency is missing, then import it.
    if not has_treelite():
        raise ImportError("No treelite package found")
    import treelite
    import treelite.runtime

    if not has_xgboost():
        raise ImportError("No XGBoost package found")
    import xgboost as xgb

    # Cap the training set at 1e5 rows.
    row_count = min(data[0].shape[0], 100000)
    train_matrix = xgb.DMatrix(data[0][:row_count, :],
                               label=data[1][:row_count])

    # Defaults first; caller-supplied entries override them.
    train_params = dict(silent=1,
                        eval_metric="error",
                        objective="binary:logistic")
    train_params.update(arg)

    depth = arg["max_depth"]
    rounds = arg["num_rounds"]
    feature_count = data[0].shape[1]

    work_dir = tempfile.mkdtemp()
    model_file = os.path.join(
        work_dir, f"xgb_{depth}_{rounds}_{feature_count}_{row_count}.model")
    # The shared library name is the model path with "treelite.so" appended.
    shared_lib = model_file + "treelite.so"

    booster = xgb.train(train_params, train_matrix, rounds)

    compiled = treelite.Model.from_xgboost(booster)
    compiled.export_lib(toolchain="gcc",
                        libpath=shared_lib,
                        params={'parallel_comp': 40},
                        verbose=False)

    return treelite.runtime.Predictor(shared_lib, verbose=False)
Пример #4
0
                              output_class=False)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_mse = mean_squared_error(y_validation, fil_preds)

    assert fil_mse == pytest.approx(xgb_mse, 0.01)
    assert array_equal(fil_preds, xgb_preds)


@pytest.mark.parametrize('n_rows', [1000])
@pytest.mark.parametrize('n_columns', [20])
@pytest.mark.parametrize('n_estimators', [1, 10])
@pytest.mark.parametrize('max_depth', [2, 10, 20])
@pytest.mark.parametrize('storage_type', ['DENSE', 'SPARSE'])
@pytest.mark.parametrize('model_class',
                         [GradientBoostingClassifier, RandomForestClassifier])
@pytest.mark.skipif(has_treelite() is False, reason="need to install treelite")
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                storage_type, model_class):

    # skip depth 20 for dense tests
    if max_depth == 20 and storage_type == 'DENSE':
        return

    # settings
    classification = True  # change this to false to use regression
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
Пример #5
0
def all_algorithms():
    """Build and return the list of every defined AlgorithmPair.

    The pairs are created in a fixed order. For the two FIL entries the CPU
    class is the ``treelite`` module when ``has_treelite()`` is true and
    ``None`` otherwise. The UMAP pair is appended last, and only when
    ``has_umap()`` is true.

    Returns
    -------
    list
        The AlgorithmPair objects, in definition order.
    """
    # --- clustering, decomposition and projection -------------------------
    pairs = [
        AlgorithmPair(
            sklearn.cluster.KMeans,
            cuml.cluster.KMeans,
            name="KMeans",
            shared_args={"init": "random", "n_clusters": 8, "max_iter": 300},
            accepts_labels=False,
            accuracy_function=metrics.homogeneity_score,
        ),
        AlgorithmPair(
            sklearn.decomposition.PCA,
            cuml.PCA,
            name="PCA",
            shared_args={"n_components": 10},
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.decomposition.TruncatedSVD,
            cuml.decomposition.tsvd.TruncatedSVD,
            name="tSVD",
            shared_args={"n_components": 10},
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.random_projection.GaussianRandomProjection,
            cuml.random_projection.GaussianRandomProjection,
            name="GaussianRandomProjection",
            shared_args={"n_components": "auto"},
            bench_func=fit_transform,
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.neighbors.NearestNeighbors,
            cuml.neighbors.NearestNeighbors,
            name="NearestNeighbors",
            shared_args={"n_neighbors": 1024},
            cpu_args={"algorithm": "brute", "n_jobs": -1},
            cuml_args=dict(),
            accepts_labels=False,
            bench_func=fit_kneighbors,
        ),
        AlgorithmPair(
            sklearn.cluster.DBSCAN,
            cuml.DBSCAN,
            name="DBSCAN",
            shared_args={"eps": 3, "min_samples": 2},
            cpu_args={"algorithm": "brute"},
            accepts_labels=False,
        ),
    ]

    # --- linear models ------------------------------------------------------
    pairs += [
        AlgorithmPair(
            sklearn.linear_model.LinearRegression,
            cuml.linear_model.LinearRegression,
            name="LinearRegression",
            shared_args=dict(),
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.ElasticNet,
            cuml.linear_model.ElasticNet,
            name="ElasticNet",
            shared_args=dict(alpha=0.1, l1_ratio=0.5),
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.Lasso,
            cuml.linear_model.Lasso,
            name="Lasso",
            shared_args=dict(),
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.Ridge,
            cuml.linear_model.Ridge,
            name="Ridge",
            shared_args=dict(),
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.LogisticRegression,
            cuml.linear_model.LogisticRegression,
            name="LogisticRegression",
            shared_args={},  # Use default solvers
            accepts_labels=True,
            accuracy_function=metrics.accuracy_score,
        ),
    ]

    # --- tree ensembles, manifold learning and SGD --------------------------
    pairs += [
        AlgorithmPair(
            sklearn.ensemble.RandomForestClassifier,
            cuml.ensemble.RandomForestClassifier,
            name="RandomForestClassifier",
            shared_args=dict(max_features=1.0, n_estimators=10),
            accepts_labels=True,
            cpu_data_prep_hook=_labels_to_int_hook,
            cuml_data_prep_hook=_labels_to_int_hook,
            accuracy_function=metrics.accuracy_score,
        ),
        AlgorithmPair(
            sklearn.ensemble.RandomForestRegressor,
            cuml.ensemble.RandomForestRegressor,
            name="RandomForestRegressor",
            shared_args=dict(max_features=1.0, n_estimators=10),
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.manifold.TSNE,
            cuml.manifold.TSNE,
            name="TSNE",
            shared_args={},
            accepts_labels=False,
        ),
        AlgorithmPair(
            None,  # no CPU counterpart for this algorithm
            cuml.linear_model.MBSGDClassifier,
            name="MBSGDClassifier",
            shared_args=dict(),
            cuml_args={"eta0": 0.005, "epochs": 100},
            accepts_labels=True,
            accuracy_function=cuml.metrics.accuracy_score,
        ),
    ]

    # --- forest inference (treelite on CPU vs. FIL) -------------------------
    pairs += [
        AlgorithmPair(
            treelite if has_treelite() else None,
            cuml.ForestInference,
            name="FIL",
            shared_args={"num_rounds": 100, "max_depth": 10},
            cuml_args={
                "fil_algo": "AUTO",
                "output_class": False,
                "threshold": 0.5,
                "storage_type": "AUTO",
            },
            accepts_labels=False,
            setup_cpu_func=_build_treelite_classifier,
            setup_cuml_func=_build_fil_classifier,
            cpu_data_prep_hook=_treelite_format_hook,
            accuracy_function=_treelite_fil_accuracy_score,
            bench_func=predict,
        ),
        AlgorithmPair(
            treelite if has_treelite() else None,
            cuml.ForestInference,
            name="Sparse-FIL-SKL",
            shared_args={"n_estimators": 100, "max_leaf_nodes": 2**10},
            cuml_args={
                "fil_algo": "AUTO",
                "output_class": False,
                "threshold": 0.5,
                "storage_type": "SPARSE",
            },
            accepts_labels=False,
            setup_cpu_func=_build_cpu_skl_classifier,
            setup_cuml_func=_build_fil_skl_classifier,
            accuracy_function=_treelite_fil_accuracy_score,
            bench_func=predict,
        ),
    ]

    # UMAP is optional; only register it when the package is available.
    if has_umap():
        umap_pair = AlgorithmPair(
            umap.UMAP,
            cuml.manifold.UMAP,
            name="UMAP",
            shared_args={"n_neighbors": 5, "n_epochs": 500},
            accepts_labels=False,
            accuracy_function=cuml.metrics.trustworthiness,
        )
        pairs.append(umap_pair)

    return pairs
Пример #6
0
import tempfile

from cuml.benchmark.bench_helper_funcs import (
    fit,
    fit_kneighbors,
    fit_transform,
    predict,
    _build_cpu_skl_classifier,
    _build_fil_skl_classifier,
    _build_fil_classifier,
    _build_treelite_classifier,
    _treelite_fil_accuracy_score,
)
from cuml.utils.import_utils import has_treelite

if has_treelite():
    import treelite
    import treelite.runtime

if has_umap():
    import umap


class AlgorithmPair:
    """
    Wraps a cuML algorithm and (optionally) a cpu-based algorithm
    (typically scikit-learn, but does not need to be as long as it offers
    `fit` and `predict` or `transform` methods).
    Provides mechanisms to run each version with default arguments.
    If no CPU-based version of the algorithm is available, pass None for the
    cpu_class when instantiating