def main():
    """Time scikit-learn PCA training and transformation on generated data."""
    from sklearn.decomposition import PCA

    # Only the feature matrices matter here; labels are discarded.
    X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'])

    # Derive a default component count from the data shape when none given.
    if params.n_components is None:
        n_rows, n_cols = X_train.shape
        params.n_components = min(n_cols, (2 + min(n_cols, n_rows)) // 3)

    pca = PCA(svd_solver=params.svd_solver,
              whiten=params.whiten,
              n_components=params.n_components)

    # Time fit, then transform on the same training matrix.
    fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params)
    transform_time, _ = bench.measure_function_time(pca.transform, X_train,
                                                    params=params)

    bench.print_output(library='sklearn',
                       algorithm='pca',
                       stages=['training', 'transformation'],
                       params=params,
                       functions=['PCA.fit', 'PCA.transform'],
                       times=[fit_time, transform_time],
                       accuracy_type=None,
                       accuracies=[None, None],
                       data=[X_train, X_test],
                       alg_instance=pca)
def main():
    """Time scikit-learn random-forest classification fit and predict."""
    from sklearn.ensemble import RandomForestClassifier

    X_train, X_test, y_train, y_test = bench.load_data(params)

    forest = RandomForestClassifier(
        criterion=params.criterion,
        n_estimators=params.num_trees,
        max_depth=params.max_depth,
        max_features=params.max_features,
        min_samples_split=params.min_samples_split,
        max_leaf_nodes=params.max_leaf_nodes,
        min_impurity_decrease=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        random_state=params.seed,
        n_jobs=params.n_jobs,
    )

    # Recorded on params for reporting.
    params.n_classes = len(np.unique(y_train))

    fit_time, _ = bench.measure_function_time(forest.fit, X_train, y_train,
                                              params=params)
    # Training accuracy is computed from an untimed predict call.
    train_acc = 100 * accuracy_score(forest.predict(X_train), y_train)

    predict_time, predictions = bench.measure_function_time(
        forest.predict, X_test, params=params)
    test_acc = 100 * accuracy_score(predictions, y_test)

    bench.print_output(library='sklearn',
                       algorithm='decision_forest_classification',
                       stages=['training', 'prediction'],
                       params=params,
                       functions=['df_clsf.fit', 'df_clsf.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='accuracy[%]',
                       accuracies=[train_acc, test_acc],
                       data=[X_train, X_test],
                       alg_instance=forest)
def main():
    """Time sklearn logistic regression; report accuracy, log-loss, ROC-AUC."""
    from sklearn.linear_model import LogisticRegression

    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    # Resolve 'auto' into a concrete multiclass strategy.
    if params.multiclass == 'auto':
        params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial'

    # Solver-dependent default tolerance when none was supplied.
    if not params.tol:
        params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10

    clf = LogisticRegression(penalty='l2',
                             C=params.C,
                             n_jobs=params.n_jobs,
                             fit_intercept=params.fit_intercept,
                             verbose=params.verbose,
                             tol=params.tol,
                             max_iter=params.maxiter,
                             solver=params.solver,
                             multi_class=params.multiclass)

    fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train,
                                              params=params)

    # Training-set metrics (untimed).
    labels = clf.predict(X_train)
    probabilities = clf.predict_proba(X_train)
    train_acc = bench.accuracy_score(y_train, labels)
    train_log_loss = bench.log_loss(y_train, probabilities)
    train_roc_auc = bench.roc_auc_score(y_train, probabilities)

    # Timed test-set prediction, then its metrics.
    predict_time, labels = bench.measure_function_time(clf.predict, X_test,
                                                       params=params)
    probabilities = clf.predict_proba(X_test)
    test_acc = bench.accuracy_score(y_test, labels)
    test_log_loss = bench.log_loss(y_test, probabilities)
    test_roc_auc = bench.roc_auc_score(y_test, probabilities)

    bench.print_output(
        library='sklearn',
        algorithm='logistic_regression',
        stages=['training', 'prediction'],
        params=params,
        functions=['LogReg.fit', 'LogReg.predict'],
        times=[fit_time, predict_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
        ],
        data=[X_train, X_test],
        alg_instance=clf,
    )
def main():
    """Time scikit-learn random-forest regression fit and predict."""
    from sklearn.ensemble import RandomForestRegressor

    X_train, X_test, y_train, y_test = bench.load_data(params)

    forest = RandomForestRegressor(
        criterion=params.criterion,
        n_estimators=params.num_trees,
        max_depth=params.max_depth,
        max_features=params.max_features,
        min_samples_split=params.min_samples_split,
        max_leaf_nodes=params.max_leaf_nodes,
        min_impurity_decrease=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        random_state=params.seed,
        n_jobs=params.n_jobs,
    )

    fit_time, _ = bench.measure_function_time(forest.fit, X_train, y_train,
                                              params=params)
    # Training RMSE from an untimed predict call.
    train_rmse = bench.rmse_score(forest.predict(X_train), y_train)

    predict_time, predictions = bench.measure_function_time(
        forest.predict, X_test, params=params)
    test_rmse = bench.rmse_score(predictions, y_test)

    bench.print_output(library='sklearn',
                       algorithm='decision_forest_regression',
                       stages=['training', 'prediction'],
                       params=params,
                       functions=['df_regr.fit', 'df_regr.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse],
                       data=[X_train, X_test],
                       alg_instance=forest)
def main():
    """Time scikit-learn ElasticNet fit/predict and report RMSE."""
    from sklearn.linear_model import ElasticNet

    X_train, X_test, y_train, y_test = bench.load_data(params)

    regr = ElasticNet(fit_intercept=params.fit_intercept,
                      l1_ratio=params.l1_ratio,
                      alpha=params.alpha,
                      tol=params.tol,
                      max_iter=params.maxiter,
                      copy_X=False)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)

    # Prediction is timed on the training set for this benchmark.
    predict_time, train_predictions = bench.measure_function_time(
        regr.predict, X_train, params=params)
    train_rmse = bench.rmse_score(train_predictions, y_train)

    # Test RMSE from an untimed predict call.
    test_rmse = bench.rmse_score(regr.predict(X_test), y_test)

    bench.print_output(library='sklearn',
                       algorithm='elastic-net',
                       stages=['training', 'prediction'],
                       params=params,
                       functions=['ElasticNet.fit', 'ElasticNet.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse],
                       data=[X_train, X_train],
                       alg_instance=regr)
def main():
    """Time sklearn k-NN regression, or a pure neighbor search, per task."""
    from sklearn.neighbors import KNeighborsRegressor

    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    knn_regr = KNeighborsRegressor(n_neighbors=params.n_neighbors,
                                   weights=params.weights,
                                   algorithm=params.method,
                                   metric=params.metric,
                                   n_jobs=params.n_jobs)

    is_regression = params.task == 'regression'

    # Timed training.
    train_time, _ = bench.measure_function_time(knn_regr.fit, X_train, y_train,
                                                params=params)
    if is_regression:
        # Untimed training-set metrics.
        train_pred = knn_regr.predict(X_train)
        train_rmse = bench.rmse_score(y_train, train_pred)
        train_r2 = bench.r2_score(y_train, train_pred)

    # Timed prediction (regression) or raw neighbor search (otherwise).
    if is_regression:
        predict_time, test_pred = bench.measure_function_time(
            knn_regr.predict, X_test, params=params)
        test_rmse = bench.rmse_score(y_test, test_pred)
        test_r2 = bench.r2_score(y_test, test_pred)
    else:
        predict_time, _ = bench.measure_function_time(
            knn_regr.kneighbors, X_test, params=params)

    if is_regression:
        bench.print_output(
            library='sklearn',
            algorithm=knn_regr._fit_method + '_knn_regr',
            stages=['training', 'prediction'],
            params=params,
            functions=['knn_regr.fit', 'knn_regr.predict'],
            times=[train_time, predict_time],
            metric_type=['rmse', 'r2_score'],
            metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
            data=[X_train, X_test],
            alg_instance=knn_regr,
        )
    else:
        bench.print_output(
            library='sklearn',
            algorithm=knn_regr._fit_method + '_knn_search',
            stages=['training', 'search'],
            params=params,
            functions=['knn_regr.fit', 'knn_regr.kneighbors'],
            times=[train_time, predict_time],
            metric_type=None,
            metrics=[],
            data=[X_train, X_test],
            alg_instance=knn_regr,
        )
def main():
    """Time scikit-learn NuSVR fit/predict; report RMSE, R2 and SV count."""
    from sklearn.svm import NuSVR

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    # Default gamma = 1 / n_features when unspecified.
    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    regr = NuSVR(C=params.C, nu=params.nu, kernel=params.kernel,
                 cache_size=params.cache_size_mb, tol=params.tol,
                 gamma=params.gamma, degree=params.degree)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)
    params.sv_len = regr.support_.shape[0]

    # Timed prediction on the training set is what gets reported.
    predict_train_time, predictions = bench.measure_function_time(
        regr.predict, X_train, params=params)
    train_rmse = bench.rmse_score(y_train, predictions)
    train_r2 = bench.r2_score(y_train, predictions)

    # Test-set prediction is measured too, but its time is discarded.
    _, predictions = bench.measure_function_time(regr.predict, X_test,
                                                 params=params)
    test_rmse = bench.rmse_score(y_test, predictions)
    test_r2 = bench.r2_score(y_test, predictions)

    n_sv = int(regr.n_support_.sum())
    bench.print_output(
        library='sklearn',
        algorithm='nuSVR',
        stages=['training', 'prediction'],
        params=params,
        functions=['NuSVR.fit', 'NuSVR.predict'],
        times=[fit_time, predict_train_time],
        metric_type=['rmse', 'r2_score', 'n_sv'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [n_sv, n_sv],
        ],
        data=[X_train, X_train],
        alg_instance=regr,
    )
def main():
    """Time scikit-learn KMeans fit/predict, scored with Davies-Bouldin.

    Initial centroids come from (in priority order): the literal
    'k-means++' strategy, a file given via ``params.filei``, or random rows
    of the training data seeded by ``params.seed``.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load and convert generated data; labels are unused.
    X_train, X_test, _, _ = bench.load_data(params)

    X_init: Any
    if params.filei == 'k-means++':
        X_init = 'k-means++'
    # Load initial centroids from the specified path.
    elif params.filei is not None:
        loaded = np.load(params.filei)
        # BUG FIX: np.load returns a plain ndarray for .npy files, which has
        # no .items() — the old unconditional dict-comprehension crashed on
        # .npy input and left the isinstance(..., np.ndarray) branch below
        # dead. Handle both .npy arrays and .npz archives.
        if isinstance(loaded, np.ndarray):
            X_init = loaded.astype(params.dtype)
        else:
            # .npz archive: preserve the previous dict-of-arrays behaviour.
            # NOTE(review): a dict init is not accepted by stock sklearn
            # KMeans — presumably only array-shaped centroid files are used
            # in practice; confirm against the callers.
            X_init = {k: v.astype(params.dtype) for k, v in loaded.items()}
        if isinstance(X_init, np.ndarray):
            params.n_clusters = X_init.shape[0]
    # Otherwise choose random centroids from the training data.
    else:
        np.random.seed(params.seed)
        centroids_idx = np.random.randint(low=0, high=X_train.shape[0],
                                          size=params.n_clusters)
        if hasattr(X_train, "iloc"):
            X_init = X_train.iloc[centroids_idx].values
        else:
            X_init = X_train[centroids_idx]

    def fit_kmeans(X, X_init):
        """Build and fit a fresh KMeans with the chosen initialization."""
        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
                     max_iter=params.maxiter, init=X_init, n_init=1)
        alg.fit(X)
        return alg

    # Time fit.
    fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, X_init,
                                                   params=params)
    train_predict = kmeans.predict(X_train)
    acc_train = davies_bouldin_score(X_train, train_predict)

    # Time predict.
    predict_time, test_predict = bench.measure_function_time(
        kmeans.predict, X_test, params=params)
    acc_test = davies_bouldin_score(X_test, test_predict)

    bench.print_output(library='sklearn', algorithm='kmeans',
                       stages=['training', 'prediction'], params=params,
                       functions=['KMeans.fit', 'KMeans.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='davies_bouldin_score',
                       accuracies=[acc_train, acc_test],
                       data=[X_train, X_test],
                       alg_instance=kmeans)
def main():
    """Time scikit-learn DBSCAN and score it with Davies-Bouldin."""
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.cluster import davies_bouldin_score

    X, _, _, _ = bench.load_data(params, add_dtype=True)

    # N.B. algorithm='auto' will select the oneAPI Data Analytics Library
    # (oneDAL) brute-force method when running daal4py-patched scikit-learn,
    # and probably 'kdtree' when running unpatched scikit-learn.
    dbscan = DBSCAN(eps=params.eps,
                    n_jobs=params.n_jobs,
                    min_samples=params.min_samples,
                    metric='euclidean',
                    algorithm='auto')

    # Time fit; DBSCAN has no separate predict stage.
    time, _ = bench.measure_function_time(dbscan.fit, X, params=params)

    labels = dbscan.labels_
    # Noise points carry label -1 and are excluded from the cluster count.
    params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    acc = davies_bouldin_score(X, labels)

    bench.print_output(library='sklearn', algorithm='dbscan',
                       stages=['training'], params=params,
                       functions=['DBSCAN'], times=[time],
                       accuracies=[acc],
                       accuracy_type='davies_bouldin_score',
                       data=[X], alg_instance=dbscan)
def main():
    """Time scikit-learn TSNE fitting and report the final KL divergence."""
    from sklearn.manifold import TSNE

    X, _, _, _ = bench.load_data(params)

    tsne = TSNE(n_components=params.n_components,
                early_exaggeration=params.early_exaggeration,
                learning_rate=params.learning_rate,
                angle=params.angle,
                min_grad_norm=params.min_grad_norm,
                random_state=params.random_state)

    fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)

    bench.print_output(
        library='sklearn',
        algorithm='TSNE',
        stages=['training'],
        params=params,
        functions=['TSNE.fit'],
        times=[fit_time],
        metric_type='divergence',
        metrics=[tsne.kl_divergence_],
        data=[X],
        alg_instance=tsne,
    )
def main():
    """Time sklearn train_test_split on generated data."""
    from sklearn.model_selection import train_test_split

    X, y, _, _ = bench.load_data(params)

    # Split the labels too only when requested.
    data_args: Iterable = (X, y) if params.include_y else (X,)

    tts_params = {
        'train_size': params.train_size,
        'test_size': params.test_size,
        'shuffle': not params.do_not_shuffle,
        'random_state': params.seed,
    }
    if params.rng is not None:
        # NOTE(review): stock sklearn train_test_split has no 'rng' keyword;
        # presumably a patched/extended implementation accepts it — confirm.
        tts_params['rng'] = params.rng

    time, _ = bench.measure_function_time(train_test_split, *data_args,
                                          params=params, **tts_params)

    bench.print_output(library='sklearn', algorithm='train_test_split',
                       stages=['training'], params=params,
                       functions=['train_test_split'],
                       times=[time], metrics=[None], metric_type=None,
                       data=[X], alg_params=tts_params)
def main():
    """Time sklearn Lasso fit/predict; report RMSE, R2 and iteration count."""
    from sklearn.linear_model import Lasso

    X_train, X_test, y_train, y_test = bench.load_data(params)

    regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha,
                 tol=params.tol, max_iter=params.maxiter, copy_X=False)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)

    # Prediction is timed on the training data for this benchmark.
    predict_time, predictions = bench.measure_function_time(
        regr.predict, X_train, params=params)
    train_rmse = bench.rmse_score(y_train, predictions)
    train_r2 = bench.r2_score(y_train, predictions)

    # Untimed test-set metrics.
    predictions = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, predictions)
    test_r2 = bench.r2_score(y_test, predictions)

    n_iter = int(regr.n_iter_)
    bench.print_output(
        library='sklearn',
        algorithm='lasso',
        stages=['training', 'prediction'],
        params=params,
        functions=['Lasso.fit', 'Lasso.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score', 'iter'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [n_iter, n_iter],
        ],
        data=[X_train, X_test],
        alg_instance=regr,
    )
def main():
    """Time sklearn k-NN classification, or a pure neighbor search."""
    from sklearn.neighbors import KNeighborsClassifier

    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                    weights=params.weights,
                                    algorithm=params.method,
                                    metric=params.metric,
                                    n_jobs=params.n_jobs)

    is_classification = params.task == 'classification'

    # Timed training.
    train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train,
                                                params=params)
    if is_classification:
        # Untimed training-set accuracy.
        train_acc = 100 * accuracy_score(knn_clsf.predict(X_train), y_train)

    # Timed prediction (classification) or raw neighbor search (otherwise).
    if is_classification:
        predict_time, labels = bench.measure_function_time(
            knn_clsf.predict, X_test, params=params)
        test_acc = 100 * accuracy_score(labels, y_test)
    else:
        predict_time, _ = bench.measure_function_time(
            knn_clsf.kneighbors, X_test, params=params)

    if is_classification:
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_classification',
                           stages=['training', 'prediction'],
                           params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.predict'],
                           times=[train_time, predict_time],
                           accuracies=[train_acc, test_acc],
                           accuracy_type='accuracy[%]',
                           data=[X_train, X_test],
                           alg_instance=knn_clsf)
    else:
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_search',
                           stages=['training', 'search'],
                           params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
                           times=[train_time, predict_time],
                           accuracies=[],
                           accuracy_type=None,
                           data=[X_train, X_test],
                           alg_instance=knn_clsf)
def main():
    """Time sklearn LinearRegression fit/predict; report RMSE and R2."""
    from sklearn.linear_model import LinearRegression

    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    regr = LinearRegression(fit_intercept=params.fit_intercept,
                            n_jobs=params.n_jobs, copy_X=False)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)

    # Timed test-set prediction.
    predict_time, predictions = bench.measure_function_time(
        regr.predict, X_test, params=params)
    test_rmse = bench.rmse_score(y_test, predictions)
    test_r2 = bench.r2_score(y_test, predictions)

    # Untimed training-set metrics.
    predictions = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_train, predictions)
    train_r2 = bench.r2_score(y_train, predictions)

    bench.print_output(
        library='sklearn',
        algorithm='lin_reg',
        stages=['training', 'prediction'],
        params=params,
        functions=['Linear.fit', 'Linear.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score'],
        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
        data=[X_train, X_test],
        alg_instance=regr,
    )
def main():
    """Time sklearn ElasticNet fit/predict; report RMSE, R2 and iterations."""
    from sklearn.linear_model import ElasticNet

    X_train, X_test, y_train, y_test = bench.load_data(params)

    regr = ElasticNet(fit_intercept=params.fit_intercept,
                      l1_ratio=params.l1_ratio,
                      alpha=params.alpha,
                      tol=params.tol,
                      max_iter=params.maxiter)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)

    # Prediction is timed on the training data for this benchmark.
    predict_time, predictions = bench.measure_function_time(
        regr.predict, X_train, params=params)
    train_rmse = bench.rmse_score(y_train, predictions)
    train_r2 = bench.r2_score(y_train, predictions)

    # Untimed test-set metrics.
    predictions = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, predictions)
    test_r2 = bench.r2_score(y_test, predictions)

    n_iter = int(regr.n_iter_)
    bench.print_output(
        library='sklearn',
        algorithm='elasticnet',
        stages=['training', 'prediction'],
        params=params,
        functions=['ElasticNet.fit', 'ElasticNet.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score', 'iter'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [n_iter, n_iter],
        ],
        data=[X_train, X_train],
        alg_instance=regr,
    )
def main():
    """Time sklearn Ridge regression fit/predict and report RMSE."""
    from sklearn.linear_model import Ridge

    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha,
                 solver=params.solver)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train,
                                              params=params)

    # Timed test-set prediction.
    predict_time, predictions = bench.measure_function_time(
        regr.predict, X_test, params=params)
    test_rmse = bench.rmse_score(predictions, y_test)

    # Untimed training-set RMSE.
    train_rmse = bench.rmse_score(regr.predict(X_train), y_train)

    bench.print_output(library='sklearn', algorithm='ridge_regression',
                       stages=['training', 'prediction'], params=params,
                       functions=['Ridge.fit', 'Ridge.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse],
                       data=[X_train, X_test],
                       alg_instance=regr)
def main():
    """Time sklearn pairwise_distances for the configured metric."""
    from sklearn.metrics.pairwise import pairwise_distances

    X, _, _, _ = bench.load_data(params, generated_data=['X_train'],
                                 add_dtype=True)

    # Single timed computation; no separate train/test stages.
    time, _ = bench.measure_function_time(pairwise_distances, X,
                                          metric=params.metric,
                                          n_jobs=params.n_jobs,
                                          params=params)

    bench.print_output(library='sklearn', algorithm='distances',
                       stages=['computation'], params=params,
                       functions=[params.metric.capitalize()],
                       times=[time], metric_type=None, metrics=[None],
                       data=[X], alg_params={'metric': params.metric})
min_impurity_decrease=params.min_impurity_decrease, bootstrap=params.bootstrap, ) def fit(regr, X, y): return regr.fit(X, y) def predict(regr, X): return regr.predict(X, predict_model='GPU') fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params) y_pred = predict(regr, X_train) train_rmse = bench.rmse_score(y_pred, y_train) predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params) test_rmse = bench.rmse_score(y_pred, y_test) bench.print_output(library='cuml', algorithm='df_regr', stages=['training', 'prediction'],
# daal4py pairwise-distances benchmark (flat script section).
parser = argparse.ArgumentParser(
    description='daal4py pairwise distances benchmark')
parser.add_argument('--metric', default='cosine',
                    choices=['cosine', 'correlation'],
                    help='Metric to test for pairwise distances')
params = bench.parse_args(parser)

# Load data; only the generated X matrix is used.
X, _, _, _ = bench.load_data(params, generated_data=['X_train'],
                             add_dtype=True)

# Pick the daal4py distance kernel matching the requested metric.
if params.metric == 'cosine':
    pairwise_distances = cosine_distance
else:
    pairwise_distances = correlation_distance

time, _ = bench.measure_function_time(compute_distances, pairwise_distances,
                                      X, params=params)

bench.print_output(library='daal4py', algorithm='distances',
                   stages=['computation'], params=params,
                   functions=[params.metric.capitalize()],
                   times=[time], metric_type=None, metrics=[None],
                   data=[X], alg_params={'metric': params.metric})
def main():
    """Time sklearn SVC; report accuracy or probability-based metrics.

    With params.probability set, predict_proba is the timed stage and
    log-loss / ROC-AUC are reported; otherwise predict is timed and plain
    accuracy is reported.
    """
    from sklearn.svm import SVC

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    # Default gamma = 1 / n_features when unspecified.
    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = SVC(C=params.C, kernel=params.kernel,
              cache_size=params.cache_size_mb, tol=params.tol,
              gamma=params.gamma, probability=params.probability,
              random_state=43, degree=params.degree)

    fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train,
                                              params=params)
    params.sv_len = clf.support_.shape[0]

    if params.probability:
        # Probability mode: time predict_proba; accuracy is not reported.
        state_predict = 'predict_proba'
        clf_predict = clf.predict_proba
        train_acc = None
        test_acc = None

        predict_train_time, proba = bench.measure_function_time(
            clf_predict, X_train, params=params)
        train_log_loss = bench.log_loss(y_train, proba)
        train_roc_auc = bench.roc_auc_score(y_train, proba)

        # Test-set call is measured but its time is discarded.
        _, proba = bench.measure_function_time(clf_predict, X_test,
                                               params=params)
        test_log_loss = bench.log_loss(y_test, proba)
        test_roc_auc = bench.roc_auc_score(y_test, proba)
    else:
        # Hard-label mode: time predict; probability metrics not reported.
        state_predict = 'prediction'
        clf_predict = clf.predict
        train_log_loss = None
        test_log_loss = None
        train_roc_auc = None
        test_roc_auc = None

        predict_train_time, labels = bench.measure_function_time(
            clf_predict, X_train, params=params)
        train_acc = bench.accuracy_score(y_train, labels)

        _, labels = bench.measure_function_time(clf_predict, X_test,
                                                params=params)
        test_acc = bench.accuracy_score(y_test, labels)

    n_sv = int(clf.n_support_.sum())
    bench.print_output(
        library='sklearn',
        algorithm='SVC',
        stages=['training', state_predict],
        params=params,
        functions=['SVM.fit', f'SVM.{state_predict}'],
        times=[fit_time, predict_train_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
            [n_sv, n_sv],
        ],
        data=[X_train, X_train],
        alg_instance=clf,
    )
params = bench.parse_args(parser) from sklearn.linear_model import Ridge # Load data X_train, X_test, y_train, y_test = bench.load_data( params, generated_data=['X_train', 'y_train']) # Create our regression object regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha, solver=params.solver) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) # Time predict predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) test_rmse = bench.rmse_score(yp, y_test) yp = regr.predict(X_train) train_rmse = bench.rmse_score(yp, y_train) bench.print_output(library='sklearn', algorithm='ridge_regression', stages=['training', 'prediction'], params=params,
def test_transform(Xp, pca_result, eigenvalues, eigenvectors): return pca_transform_daal(pca_result, Xp, params.n_components, X_train.shape[0], eigenvalues, eigenvectors, whiten=params.whiten) columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'svd_solver', 'n_components', 'whiten', 'time') # Time fit fit_time, res = measure_function_time(test_fit, X_train, params=params) # Time transform transform_time, tr = measure_function_time(test_transform, X_test, *res[:3], params=params) print_output(library='daal4py', algorithm='pca', stages=['training', 'transformation'], columns=columns, params=params, functions=['PCA.fit', 'PCA.transform'], times=[fit_time, transform_time], accuracy_type=None,
if params.objective.startswith('reg'): task = 'regression' metric_name, metric_func = 'rmse', bench.rmse_score else: task = 'classification' metric_name, metric_func = 'accuracy[%]', utils.get_accuracy if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) if params.n_classes > 2: lgbm_params['num_class'] = params.n_classes t_creat_train, lgbm_train = bench.measure_function_time(lgbm.Dataset, X_train, y_train, params=params, free_raw_data=False) t_creat_test, lgbm_test = bench.measure_function_time(lgbm.Dataset, X_test, y_test, params=params, reference=lgbm_train, free_raw_data=False) t_train, model_lgbm = bench.measure_function_time( lgbm.train, lgbm_params, lgbm_train, params=params,
# Load data X_train, X_test, y_train, y_test = load_data(params) # Create our regression object regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, tol=params.tol, max_iter=params.maxiter, copy_X=False) columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'time') # Time fit fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params) # Time predict predict_time, pred_train = measure_function_time(regr.predict, X_train, params=params) train_rmse = rmse_score(pred_train, y_train) pred_test = regr.predict(X_test) test_rmse = rmse_score(pred_test, y_test) print_output(library='sklearn', algorithm='lasso', stages=['training', 'prediction'], columns=columns, params=params,
clf = RandomForestClassifier( criterion=params.criterion, n_estimators=params.num_trees, max_depth=params.max_depth, max_features=params.max_features, min_samples_split=params.min_samples_split, max_leaf_nodes=params.max_leaf_nodes, min_impurity_decrease=params.min_impurity_decrease, bootstrap=params.bootstrap, random_state=params.seed, n_jobs=params.n_jobs) params.n_classes = len(np.unique(y_train)) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) train_acc = 100 * accuracy_score(y_pred, y_train) predict_time, y_pred = bench.measure_function_time(clf.predict, X_test, params=params) test_acc = 100 * accuracy_score(y_pred, y_test) bench.print_output(library='sklearn', algorithm='decision_forest_classification', stages=['training', 'prediction'], params=params, functions=['df_clsf.fit', 'df_clsf.predict'], times=[fit_time, predict_time],
params = bench.parse_args(parser, prefix='daal4py') # Load data X_train, X_test, y_train, y_test = bench.load_data( params, add_dtype=True, label_2d=True) params.n_classes = len(np.unique(y_train)) if isinstance(params.max_features, float): params.max_features = int(X_train.shape[1] * params.max_features) # Time fit and predict fit_time, res = bench.measure_function_time( df_clsf_fit, X_train, y_train, params.n_classes, n_trees=params.num_trees, n_features_per_node=params.max_features, max_depth=params.max_depth, min_impurity=params.min_impurity_decrease, bootstrap=params.bootstrap, seed=params.seed, params=params) yp = df_clsf_predict(X_train, res, params.n_classes) train_acc = 100 * accuracy_score(yp, y_train) predict_time, yp = bench.measure_function_time( df_clsf_predict, X_test, res, params.n_classes, params=params) test_acc = 100 * accuracy_score(yp, y_test) bench.print_output(library='daal4py', algorithm='decision_forest_classification', stages=['training', 'prediction'], params=params, functions=['df_clsf.fit', 'df_clsf.predict'],
def test_fit(X, y): regr_train = ridge_regression_training(fptype=getFPType(X), ridgeParameters=np.array( [[params.alpha]]), interceptFlag=params.fit_intercept) return regr_train.compute(X, y) def test_predict(Xp, model): regr_predict = ridge_regression_prediction(fptype=getFPType(Xp)) return regr_predict.compute(Xp, model) # Time fit fit_time, res = bench.measure_function_time(test_fit, X_train, y_train, params=params) # Time predict predict_time, yp = bench.measure_function_time(test_predict, X_test, res.model, params=params) test_rmse = bench.rmse_score(yp.prediction, y_test) pres = test_predict(X_train, res.model) train_rmse = bench.rmse_score(pres.prediction, y_train) bench.print_output(library='daal4py', algorithm='ridge_regression', stages=['training', 'prediction'],
metric_name = 'accuracy[%]' metric_func = lambda y1, y2: 100 * accuracy_score(y1, y2) columns += ('n_classes', 'accuracy', 'time') if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) if params.n_classes > 2: xgb_params['num_class'] = params.n_classes dtrain = xgb.DMatrix(X_train, y_train) dtest = xgb.DMatrix(X_test, y_test) fit_time, booster = measure_function_time(xgb.train, xgb_params, dtrain, params.n_estimators, params=params) y_pred = convert_xgb_predictions(booster.predict(dtrain), params.objective) train_metric = metric_func(y_pred, y_train) predict_time, y_pred = measure_function_time(booster.predict, dtest, params=params) test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) print_output(library='xgboost', algorithm=f'gradient_boosted_trees_{task}', stages=['training', 'prediction'], columns=columns,
# Workaround for cuML kmeans fail # when second call of 'fit' method causes AttributeError def kmeans_fit(X): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, max_iter=params.maxiter, init=X_init, max_samples_per_batch=params.samples_per_batch) alg.fit(X) return alg # Time fit fit_time, kmeans = measure_function_time(kmeans_fit, X_train, params=params) train_predict = kmeans.predict(X_train) # Time predict predict_time, test_predict = measure_function_time(kmeans.predict, X_test, params=params) X_train_host = convert_to_numpy(X_train) train_predict_host = convert_to_numpy(train_predict) acc_train = davies_bouldin_score(X_train_host, train_predict_host) X_test_host = convert_to_numpy(X_test) test_predict_host = convert_to_numpy(test_predict) acc_test = davies_bouldin_score(X_test_host, test_predict_host)
parser.add_argument('-m', '--min-samples', default=5, type=int, help='The minimum number of samples required in a ' 'neighborhood to consider a point a core point') params = bench.parse_args(parser) # Load generated data X, _, _, _ = bench.load_data(params) # Create our clustering object dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples) # Time fit time, _ = bench.measure_function_time(dbscan.fit, X, params=params) labels = dbscan.labels_ X_host = bench.convert_to_numpy(X) labels_host = bench.convert_to_numpy(labels) acc = davies_bouldin_score(X_host, labels_host) params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0) bench.print_output(library='cuml', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], metrics=[acc],