def _build_treelite_classifier(m, data, arg=None, tmpdir=None):
    """Setup function for treelite classification benchmarking.

    Loads a saved XGBoost model from ``tmpdir``, compiles it into a shared
    library with treelite and returns a runtime predictor backed by that
    library.

    Parameters
    ----------
    m : object
        Not used by this function.
    data : tuple
        ``data[0]`` must expose ``.shape``; its row and column counts are
        part of the model file name that is looked up.
    arg : dict, optional
        Must contain ``"max_depth"`` and ``"num_rounds"``; both are part of
        the model file name.
    tmpdir : str or path-like
        Directory containing the saved XGBoost model. The compiled library
        is written into the same directory.

    Returns
    -------
    treelite.runtime.Predictor
        Predictor loaded from the freshly compiled library.

    Raises
    ------
    ImportError
        If treelite or XGBoost is not available.
    KeyError
        If ``arg`` lacks ``"max_depth"`` or ``"num_rounds"``.
    TypeError
        If ``tmpdir`` is None.
    """
    from cuml.utils.import_utils import has_treelite, has_xgboost
    if has_treelite():
        import treelite
        import treelite.runtime
    else:
        raise ImportError("No treelite package found")
    if has_xgboost():
        import xgboost as xgb
    else:
        raise ImportError("No XGBoost package found")

    # A None default avoids sharing one dict object between calls.
    arg = {} if arg is None else arg

    max_depth = arg["max_depth"]
    num_rounds = arg["num_rounds"]
    n_feature = data[0].shape[1]
    train_size = data[0].shape[0]

    # Without a directory there is no saved model to load; fail with an
    # explicit message rather than the opaque error from os.path.join(None).
    if tmpdir is None:
        raise TypeError(
            "tmpdir must be the directory containing the saved XGBoost "
            "model, not None")

    # NOTE(review): the file name presumably has to match the one used by
    # whichever setup function saved the model - confirm against that code.
    model_name = f"xgb_{max_depth}_{num_rounds}_{n_feature}_{train_size}.model"
    model_path = os.path.join(tmpdir, model_name)

    bst = xgb.Booster()
    bst.load_model(model_path)
    tl_model = treelite.Model.from_xgboost(bst)

    # Compile once and load the predictor from the very same path.
    lib_path = model_path + "treelite.so"
    tl_model.export_lib(toolchain="gcc",
                        libpath=lib_path,
                        params={'parallel_comp': 40},
                        verbose=False)
    return treelite.runtime.Predictor(lib_path, verbose=False)
def _treelite_format_hook(data):
    """Wrap the first element of ``data`` in a treelite runtime batch.

    Returns a 2-tuple: ``data[0]`` converted with
    ``treelite.runtime.Batch.from_npy2d`` and ``data[1]`` passed through
    untouched. Raises ImportError when treelite is not available.
    """
    from cuml.utils.import_utils import has_treelite

    # Bail out early so the treelite imports below only run when available.
    if not has_treelite():
        raise ImportError("No treelite package found")

    import treelite
    import treelite.runtime

    batch = treelite.runtime.Batch.from_npy2d(data[0])
    return batch, data[1]
def _build_treelite_classifier(m, data, arg=None):
    """Setup function for treelite classification benchmarking.

    Trains an XGBoost binary classifier on (at most the first 100000 rows
    of) ``data``, compiles it into a shared library with treelite inside a
    fresh temporary directory and returns a runtime predictor backed by
    that library.

    Parameters
    ----------
    m : object
        Not used by this function.
    data : tuple
        ``data[0]`` holds the features and ``data[1]`` the labels; both are
        sliced to the training size.
    arg : dict, optional
        Extra XGBoost parameters merged over the defaults. Must contain
        ``"max_depth"`` and ``"num_rounds"``.

    Returns
    -------
    treelite.runtime.Predictor
        Predictor loaded from the freshly compiled library.

    Raises
    ------
    ImportError
        If treelite or XGBoost is not available.
    KeyError
        If ``arg`` lacks ``"max_depth"`` or ``"num_rounds"``.
    """
    import atexit
    import shutil

    from cuml.utils.import_utils import has_treelite, has_xgboost
    if has_treelite():
        import treelite
        import treelite.runtime
    else:
        raise ImportError("No treelite package found")
    if has_xgboost():
        import xgboost as xgb
    else:
        raise ImportError("No XGBoost package found")

    # A None default avoids sharing one dict object between calls.
    arg = {} if arg is None else arg

    # use maximum 1e5 rows to train the model
    train_size = min(data[0].shape[0], 100000)
    dtrain = xgb.DMatrix(data[0][:train_size, :], label=data[1][:train_size])

    # NOTE(review): newer XGBoost releases appear to favour `verbosity`
    # over `silent` - confirm against the pinned XGBoost version.
    params = {
        "silent": 1,
        "eval_metric": "error",
        "objective": "binary:logistic"
    }
    params.update(arg)
    max_depth = arg["max_depth"]
    num_rounds = arg["num_rounds"]
    n_feature = data[0].shape[1]

    # mkdtemp() never deletes its directory; schedule removal at interpreter
    # exit so repeated benchmark runs do not pile up compiled libraries.
    # Cleanup is deferred (not immediate) because the returned predictor is
    # loaded from a file inside this directory.
    tmpdir = tempfile.mkdtemp()
    atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)

    model_name = f"xgb_{max_depth}_{num_rounds}_{n_feature}_{train_size}.model"
    model_path = os.path.join(tmpdir, model_name)

    bst = xgb.train(params, dtrain, num_rounds)
    tl_model = treelite.Model.from_xgboost(bst)

    # Compile once and load the predictor from the very same path.
    lib_path = model_path + "treelite.so"
    tl_model.export_lib(toolchain="gcc",
                        libpath=lib_path,
                        params={'parallel_comp': 40},
                        verbose=False)
    return treelite.runtime.Predictor(lib_path, verbose=False)
output_class=False) fil_preds = np.asarray(fm.predict(X_validation)) fil_mse = mean_squared_error(y_validation, fil_preds) assert fil_mse == pytest.approx(xgb_mse, 0.01) assert array_equal(fil_preds, xgb_preds) @pytest.mark.parametrize('n_rows', [1000]) @pytest.mark.parametrize('n_columns', [20]) @pytest.mark.parametrize('n_estimators', [1, 10]) @pytest.mark.parametrize('max_depth', [2, 10, 20]) @pytest.mark.parametrize('storage_type', ['DENSE', 'SPARSE']) @pytest.mark.parametrize('model_class', [GradientBoostingClassifier, RandomForestClassifier]) @pytest.mark.skipif(has_treelite() is False, reason="need to install treelite") def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth, storage_type, model_class): # skip depth 20 for dense tests if max_depth == 20 and storage_type == 'DENSE': return # settings classification = True # change this to false to use regression n_categories = 2 random_state = np.random.RandomState(43210) X, y = simulate_data(n_rows, n_columns, n_categories,
def all_algorithms():
    """Build and return the list of every AlgorithmPair definition.

    The pairs are assembled in contiguous groups and concatenated in the
    order they are declared below. The UMAP pair is appended last, and only
    when ``has_umap()`` reports that UMAP is available.
    """
    # Pairs benchmarked without labels (clustering, decomposition,
    # projection, neighbors).
    unlabeled_pairs = [
        AlgorithmPair(
            sklearn.cluster.KMeans,
            cuml.cluster.KMeans,
            shared_args={"init": "random", "n_clusters": 8, "max_iter": 300},
            name="KMeans",
            accepts_labels=False,
            accuracy_function=metrics.homogeneity_score,
        ),
        AlgorithmPair(
            sklearn.decomposition.PCA,
            cuml.PCA,
            shared_args={"n_components": 10},
            name="PCA",
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.decomposition.TruncatedSVD,
            cuml.decomposition.tsvd.TruncatedSVD,
            shared_args={"n_components": 10},
            name="tSVD",
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.random_projection.GaussianRandomProjection,
            cuml.random_projection.GaussianRandomProjection,
            shared_args={"n_components": "auto"},
            name="GaussianRandomProjection",
            bench_func=fit_transform,
            accepts_labels=False,
        ),
        AlgorithmPair(
            sklearn.neighbors.NearestNeighbors,
            cuml.neighbors.NearestNeighbors,
            shared_args={"n_neighbors": 1024},
            cpu_args={"algorithm": "brute", "n_jobs": -1},
            cuml_args={},
            name="NearestNeighbors",
            accepts_labels=False,
            bench_func=fit_kneighbors,
        ),
        AlgorithmPair(
            sklearn.cluster.DBSCAN,
            cuml.DBSCAN,
            shared_args={"eps": 3, "min_samples": 2},
            cpu_args={"algorithm": "brute"},
            name="DBSCAN",
            accepts_labels=False,
        ),
    ]

    # Pairs built on the linear_model modules.
    linear_model_pairs = [
        AlgorithmPair(
            sklearn.linear_model.LinearRegression,
            cuml.linear_model.LinearRegression,
            shared_args={},
            name="LinearRegression",
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.ElasticNet,
            cuml.linear_model.ElasticNet,
            shared_args={"alpha": 0.1, "l1_ratio": 0.5},
            name="ElasticNet",
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.Lasso,
            cuml.linear_model.Lasso,
            shared_args={},
            name="Lasso",
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.Ridge,
            cuml.linear_model.Ridge,
            shared_args={},
            name="Ridge",
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.linear_model.LogisticRegression,
            cuml.linear_model.LogisticRegression,
            shared_args={},  # Use default solvers
            name="LogisticRegression",
            accepts_labels=True,
            accuracy_function=metrics.accuracy_score,
        ),
    ]

    # Random forests, TSNE and the cuML-only MBSGD classifier.
    ensemble_and_other_pairs = [
        AlgorithmPair(
            sklearn.ensemble.RandomForestClassifier,
            cuml.ensemble.RandomForestClassifier,
            shared_args={"max_features": 1.0, "n_estimators": 10},
            name="RandomForestClassifier",
            accepts_labels=True,
            cpu_data_prep_hook=_labels_to_int_hook,
            cuml_data_prep_hook=_labels_to_int_hook,
            accuracy_function=metrics.accuracy_score,
        ),
        AlgorithmPair(
            sklearn.ensemble.RandomForestRegressor,
            cuml.ensemble.RandomForestRegressor,
            shared_args={"max_features": 1.0, "n_estimators": 10},
            name="RandomForestRegressor",
            accepts_labels=True,
            accuracy_function=metrics.r2_score,
        ),
        AlgorithmPair(
            sklearn.manifold.TSNE,
            cuml.manifold.TSNE,
            shared_args={},
            name="TSNE",
            accepts_labels=False,
        ),
        AlgorithmPair(
            None,
            cuml.linear_model.MBSGDClassifier,
            shared_args={},
            cuml_args={"eta0": 0.005, "epochs": 100},
            name="MBSGDClassifier",
            accepts_labels=True,
            accuracy_function=cuml.metrics.accuracy_score,
        ),
    ]

    # Forest inference pairs; the CPU side is the treelite module when it is
    # installed and None otherwise.
    forest_inference_pairs = [
        AlgorithmPair(
            treelite if has_treelite() else None,
            cuml.ForestInference,
            shared_args={"num_rounds": 100, "max_depth": 10},
            cuml_args={
                "fil_algo": "AUTO",
                "output_class": False,
                "threshold": 0.5,
                "storage_type": "AUTO",
            },
            name="FIL",
            accepts_labels=False,
            setup_cpu_func=_build_treelite_classifier,
            setup_cuml_func=_build_fil_classifier,
            cpu_data_prep_hook=_treelite_format_hook,
            accuracy_function=_treelite_fil_accuracy_score,
            bench_func=predict,
        ),
        AlgorithmPair(
            treelite if has_treelite() else None,
            cuml.ForestInference,
            shared_args={"n_estimators": 100, "max_leaf_nodes": 2**10},
            cuml_args={
                "fil_algo": "AUTO",
                "output_class": False,
                "threshold": 0.5,
                "storage_type": "SPARSE",
            },
            name="Sparse-FIL-SKL",
            accepts_labels=False,
            setup_cpu_func=_build_cpu_skl_classifier,
            setup_cuml_func=_build_fil_skl_classifier,
            accuracy_function=_treelite_fil_accuracy_score,
            bench_func=predict,
        ),
    ]

    algorithms = (unlabeled_pairs + linear_model_pairs +
                  ensemble_and_other_pairs + forest_inference_pairs)

    if has_umap():
        umap_pair = AlgorithmPair(
            umap.UMAP,
            cuml.manifold.UMAP,
            shared_args={"n_neighbors": 5, "n_epochs": 500},
            name="UMAP",
            accepts_labels=False,
            accuracy_function=cuml.metrics.trustworthiness,
        )
        algorithms.append(umap_pair)

    return algorithms
import tempfile from cuml.benchmark.bench_helper_funcs import ( fit, fit_kneighbors, fit_transform, predict, _build_cpu_skl_classifier, _build_fil_skl_classifier, _build_fil_classifier, _build_treelite_classifier, _treelite_fil_accuracy_score, ) from cuml.utils.import_utils import has_treelite if has_treelite(): import treelite import treelite.runtime if has_umap(): import umap class AlgorithmPair: """ Wraps a cuML algorithm and (optionally) a cpu-based algorithm (typically scikit-learn, but does not need to be as long as it offers `fit` and `predict` or `transform` methods). Provides mechanisms to run each version with default arguments. If no CPU-based version of the algorithm is available, pass None for the cpu_class when instantiating