def test_svm_multiple_nodes(ray_start_cluster_2_nodes):
    """Randomized SVC search on the Ray backend, with an ObjectRef-caching check.

    ``sklearn``'s internal ``Parallel`` is patched with a subclass that, right
    before the backend is torn down, asserts that the very ``digits.data``
    object is present in the backend pool's registry.
    """
    digits = load_digits()
    grid_points = 30
    param_space = {
        "C": np.logspace(-6, 6, grid_points),
        "gamma": np.logspace(-8, 8, grid_points),
        "tol": np.logspace(-4, -1, grid_points),
        "class_weight": [None, "balanced"],
    }

    class MockParallel(joblib.Parallel):
        """``joblib.Parallel`` that inspects the pool registry on teardown."""

        def _terminate_backend(self):
            backend = self._backend
            if backend is None:
                return
            # test ObjectRef caching (PR #16879)
            registry = backend._pool._registry
            assert any(obj is digits.data for obj, _ref in registry)
            backend.terminate()

    estimator = SVC(kernel="rbf")
    with mock.patch("sklearn.model_selection._search.Parallel", MockParallel):
        search = RandomizedSearchCV(
            estimator,
            param_space,
            cv=5,
            n_iter=2,
            verbose=10,
        )
        register_ray()
        with joblib.parallel_backend("ray"):
            search.fit(digits.data, digits.target)
    assert ray.is_initialized()
def test_task_to_actor_assignment(ray_start_4_cpu):
    """Check that four sleeping tasks are launched at (almost) the same time.

    Each task records its launch time and then sleeps; all launch times must
    lie within one second of each other and every worker id must come back.
    """
    register_ray()

    pause_time = 5
    num_workers = 4

    def worker_func(worker_id):
        launch_time = time.time()
        time.sleep(pause_time)
        return worker_id, launch_time

    with parallel_backend("ray", n_jobs=-1):
        jobs = (delayed(worker_func)(worker_id) for worker_id in range(num_workers))
        output = Parallel()(jobs)

    worker_ids = {worker_id for worker_id, _ in output}
    launch_times = [launch_time for _, launch_time in output]

    assert len(worker_ids) == num_workers

    # Compare every pair among the first ``num_workers`` launch times.
    for idx, earlier in enumerate(launch_times[:num_workers]):
        for later in launch_times[idx + 1:num_workers]:
            assert abs(earlier - later) < 1
def test_cross_validation(shutdown_only):
    """5-fold cross-validation of a linear SVC on iris under the Ray backend.

    Expects one score per fold and every score above 0.95.
    """
    register_ray()
    iris = load_iris()
    classifier = SVC(kernel="linear", C=1, random_state=0)

    with joblib.parallel_backend("ray", n_jobs=5):
        fold_scores = cross_val_score(classifier, iris.data, iris.target, cv=5)

    assert len(fold_scores) == 5
    assert all(score > 0.95 for score in fold_scores)
def rf_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    """Grid-search a random forest on the Ray joblib backend and predict.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict`` of the fitted search.
        y_train: Training labels passed to ``fit``.

    Returns:
        The output of ``predict`` on ``X_test``.
    """
    from sklearn.ensemble import RandomForestClassifier

    tree_counts = [25, 50, 100, 150, 200, 250, 300, 350]
    forest = RandomForestClassifier(
        oob_score=True,
        random_state=1,
        warm_start=True,
        n_jobs=-1,
    )
    search = GridSearchCV(
        forest,
        param_grid={'n_estimators': tree_counts},
        scoring='accuracy',
        n_jobs=-1,
    )

    register_ray()
    # Only the fit runs inside the Ray backend context.
    with joblib.parallel_backend('ray'):
        fitted = search.fit(X_train, y_train)
    return fitted.predict(X_test)
def gb_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    """Grid-search a gradient-boosting classifier on the Ray backend and predict.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict`` of the fitted search.
        y_train: Training labels passed to ``fit``.

    Returns:
        The output of ``predict`` on ``X_test``.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    search_grid = {
        'n_estimators': [150, 200, 250, 300, 350],
        'learning_rate': [0.05, 0.1, 0.2],
    }
    booster = GradientBoostingClassifier(random_state=1)
    search = GridSearchCV(
        booster,
        param_grid=search_grid,
        scoring='accuracy',
        n_jobs=1,
    )

    register_ray()
    # Only the fit runs inside the Ray backend context.
    with joblib.parallel_backend('ray'):
        fitted = search.fit(X_train, y_train)
    return fitted.predict(X_test)
def maybe_ray():
    """Yield once, with the Ray joblib backend active if ``RAY_ADDRESS`` is set.

    When the ``RAY_ADDRESS`` environment variable is present, the Ray backend
    is registered and entered for the duration of the ``yield``; otherwise the
    function simply yields with nothing changed.
    """
    with ExitStack() as stack:
        ray_address = os.environ.get("RAY_ADDRESS")
        if ray_address is not None:
            # Imported lazily so that they are only needed when Ray is requested.
            import joblib
            from ray.util.joblib import register_ray

            logger.debug("Using RAY_ADDRESS=%s as joblib backend", ray_address)
            register_ray()
            ray_backend = joblib.parallel_backend("ray")
            stack.enter_context(ray_backend)
        yield
def test_ray_remote_args(shutdown_only):
    """Check that ``ray_remote_args`` passed to the backend reserve resources.

    Each task is submitted with one unit of ``custom_resource``; while a task
    runs, fewer than the initial four units must be reported as available.
    """
    ray.init(num_cpus=4, resources={"custom_resource": 4})
    register_ray()

    free_before = ray.available_resources().get("custom_resource", 0)
    assert free_before == 4

    def check_resource():
        assert ray.available_resources().get("custom_resource", 0) < 4

    remote_args = {"resources": {"custom_resource": 1}}
    with joblib.parallel_backend("ray", ray_remote_args=remote_args):
        tasks = (joblib.delayed(check_resource)() for _ in range(8))
        joblib.Parallel()(tasks)
def ab_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    """Grid-search AdaBoost over depth-1 decision trees on the Ray backend.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict`` of the fitted search.
        y_train: Training labels passed to ``fit``.

    Returns:
        The output of ``predict`` on ``X_test``.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier

    search_grid = {
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'learning_rate': [0.5, 1.0, 2.0],
    }
    stump = DecisionTreeClassifier(max_depth=1, random_state=1)
    search = GridSearchCV(
        AdaBoostClassifier(stump),
        param_grid=search_grid,
        scoring='accuracy',
        n_jobs=1,
    )

    register_ray()
    # Only the fit runs inside the Ray backend context.
    with joblib.parallel_backend('ray'):
        fitted = search.fit(X_train, y_train)
    return fitted.predict(X_test)
def dt_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    """Grid-search a decision tree on the Ray joblib backend and predict.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict`` of the fitted search.
        y_train: Training labels passed to ``fit``.

    Returns:
        The output of ``predict`` on ``X_test``.
    """
    from sklearn.tree import DecisionTreeClassifier

    search_grid = {
        'max_depth': range(10, 20),
        'max_features': range(3, 11),
    }
    tree = DecisionTreeClassifier(random_state=1)
    search = GridSearchCV(
        tree,
        param_grid=search_grid,
        scoring='accuracy',
        n_jobs=-1,
    )

    register_ray()
    # Only the fit runs inside the Ray backend context.
    with joblib.parallel_backend('ray'):
        fitted = search.fit(X_train, y_train)
    return fitted.predict(X_test)
def svm_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    """Grid-search an SVC on the Ray joblib backend and predict.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict`` of the fitted search.
        y_train: Training labels passed to ``fit``.

    Returns:
        The output of ``predict`` on ``X_test``.
    """
    from sklearn.svm import SVC

    search_grid = {
        'C': [0.1, 1, 10, 50, 100, 250, 500, 1000],
        'gamma': [1, 0.5, 0.25, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'sigmoid'],
    }
    search = GridSearchCV(SVC(), search_grid, scoring='accuracy', n_jobs=1)

    register_ray()
    # Only the fit runs inside the Ray backend context.
    with joblib.parallel_backend('ray'):
        fitted = search.fit(X_train, y_train)
    return fitted.predict(X_test)
def test_svm_multiple_nodes(ray_start_cluster_2_nodes):
    """Randomized SVC search on digits under the Ray backend (cluster fixture).

    Two parameter settings are sampled with 5-fold CV; afterwards Ray must be
    initialized.
    """
    digits = load_digits()
    grid_points = 30
    search_space = {
        "C": np.logspace(-6, 6, grid_points),
        "gamma": np.logspace(-8, 8, grid_points),
        "tol": np.logspace(-4, -1, grid_points),
        "class_weight": [None, "balanced"],
    }

    estimator = SVC(kernel="rbf")
    tuner = RandomizedSearchCV(
        estimator,
        search_space,
        cv=5,
        n_iter=2,
        verbose=10,
    )

    register_ray()
    with joblib.parallel_backend("ray"):
        tuner.fit(digits.data, digits.target)
    assert ray.is_initialized()
def ensemble_model(self):
    """Train a stacked ensemble pipeline, persist it and report its metrics.

    The pipeline selects ``self.selected_feat``, standardizes the features and
    feeds them to a ``StackingClassifier`` (SVC, random forest and Gaussian NB
    with a logistic-regression meta classifier). It is fitted on
    ``self.X_train`` / ``self.y_train``, dumped to ``model_output_path``, and
    evaluated on ``self.X_test`` / ``self.y_test``: a ROC curve is saved to
    ``roc_image_path`` and accuracy, precision, recall and F1 are printed.
    """
    clf1 = svm.SVC(probability=True)
    clf2 = RandomForestClassifier(n_estimators=100)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(
        classifiers=[clf1, clf2, clf3],
        use_probas=True,
        average_probas=False,
        meta_classifier=lr,
    )
    pipe = Pipeline([
        ("selector", ColumnTransformer(
            [("selector", "passthrough", self.selected_feat)],
            remainder="drop")),
        ('scale', StandardScaler()),
        ('ensemble_model', sclf),
    ])

    ## distributed training ##
    register_ray()
    # NOTE(review): the Ray backend is registered but the "threading" backend
    # is the one selected here — confirm whether "ray" was intended.
    with parallel_backend("threading", n_jobs=4):
        pipe.fit(self.X_train, self.y_train)
    joblib.dump(pipe, model_output_path)

    y_pred = pipe.predict(self.X_test)

    # AUC ROC Curve values is considered much more important than the
    # accuracy to evaluate the model
    predicting_probabilites = pipe.predict_proba(self.X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(self.y_test, predicting_probabilites)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(14, 12))
    plt.subplot(222)
    # The legend label must be a string; a tuple would be rendered as its repr.
    plt.plot(fpr, tpr,
             label="Area under the curve: {:.2f}".format(roc_auc),
             color="r")
    plt.plot([1, 0], [1, 0], linestyle="dashed", color="k")
    plt.legend(loc="best")
    plt.title("ROC - CURVE & AREA UNDER CURVE", fontsize=20)
    plt.savefig(roc_image_path)

    print('Accuracy of an ensemble model:{:.2f}'.format(
        accuracy_score(self.y_test, y_pred)))
    print('Precision of an ensemble model:{:.2f}'.format(
        precision_score(self.y_test, y_pred)))
    print('Recall of an ensemble model:{:.2f}'.format(
        recall_score(self.y_test, y_pred)))
    print('F1score of an ensemble model:{:.2f}'.format(
        f1_score(self.y_test, y_pred)))
def test_svm_single_node(shutdown_only):
    """Randomized SVC search on digits under the Ray backend (single node).

    Fifty parameter settings are sampled with 3-fold CV; afterwards Ray must
    be initialized.
    """
    digits = load_digits()
    search_space = {
        "C": np.logspace(-6, 6, 10),
        "gamma": np.logspace(-8, 8, 10),
        "tol": np.logspace(-4, -1, 3),
        "class_weight": [None, "balanced"],
    }

    estimator = SVC(kernel="rbf")
    tuner = RandomizedSearchCV(
        estimator,
        search_space,
        cv=3,
        n_iter=50,
        verbose=10,
    )

    register_ray()
    with joblib.parallel_backend("ray"):
        tuner.fit(digits.data, digits.target)
    assert ray.is_initialized()
def predict(
    self,
    data: DataBatchType,
    feature_columns: Optional[Union[List[str], List[int]]] = None,
    num_estimator_cpus: Optional[int] = 1,
    **predict_kwargs,
) -> pd.DataFrame:
    """Run inference on a batch of data.

    Args:
        data: Input batch, either a pandas DataFrame or a numpy array.
        feature_columns: Names or indices of the columns of ``data`` to
            predict on. If None, every column of ``data`` is used.
        num_estimator_cpus: If not None, all ``n_jobs`` and ``thread_count``
            parameters of the estimator (nested objects included) are set
            to this value.
        **predict_kwargs: Extra keyword arguments for ``estimator.predict``.

    Examples:

    .. code-block:: python

        import numpy as np
        from sklearn.ensemble import RandomForestClassifier
        from ray.air.predictors.sklearn import SklearnPredictor

        train_X = np.array([[1, 2], [3, 4]])
        train_y = np.array([0, 1])

        model = RandomForestClassifier().fit(train_X, train_y)
        predictor = SklearnPredictor(model=model)

        data = np.array([[1, 2], [3, 4]])
        predictions = predictor.predict(data)

        # Only use first and second column as the feature
        data = np.array([[1, 2, 8], [3, 4, 9]])
        predictions = predictor.predict(data, feature_columns=[0, 1])

    .. code-block:: python

        import pandas as pd
        from sklearn.ensemble import RandomForestClassifier
        from ray.air.predictors.sklearn import SklearnPredictor

        train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        train_y = pd.Series([0, 1])

        model = RandomForestClassifier().fit(train_X, train_y)
        predictor = SklearnPredictor(model=model)

        # Pandas dataframe.
        data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        predictions = predictor.predict(data)

        # Only use first and second column as the feature
        data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
        predictions = predictor.predict(data, feature_columns=["A", "B"])

    Returns:
        pd.DataFrame: Prediction result, with a single ``predictions`` column
        or ``predictions_<i>`` columns when there are several outputs.
    """
    register_ray()

    if self.preprocessor:
        data = self.preprocessor.transform_batch(data)

    if num_estimator_cpus:
        set_cpu_params(self.estimator, num_estimator_cpus)

    if feature_columns:
        # numpy arrays are sliced by column position, DataFrames by label.
        array_input = isinstance(data, np.ndarray)
        data = data[:, feature_columns] if array_input else data[feature_columns]

    with parallel_backend("ray", n_jobs=num_estimator_cpus):
        raw_predictions = self.estimator.predict(data, **predict_kwargs)
        df = pd.DataFrame(raw_predictions)

    n_outputs = len(df.columns)
    if n_outputs == 1:
        df.columns = ["predictions"]
    else:
        df.columns = [f"predictions_{i}" for i in range(n_outputs)]
    return df
def main(backend, address, mib, refit, jobs):
    """Benchmark a text-classification grid search on a chosen parallel backend.

    Args:
        backend: One of ``'lithops'``, ``'ray'``, ``'tune'`` or ``'dask'``;
            any other value falls through to the plain scikit-learn search
            and is passed as-is to ``joblib.parallel_backend``.
        address: Cluster/scheduler address. For ``'ray'`` and ``'tune'`` a
            value of None is replaced by ``'auto'``; for ``'dask'`` it is
            mandatory.
        mib: Forwarded to ``load_data`` (presumably the dataset size in MiB —
            confirm against ``load_data``).
        refit: Forwarded to the search as ``refit``; when truthy the best
            score and parameters are printed.
        jobs: Forwarded to the search as ``n_jobs``.

    Raises:
        SystemExit: With status 1 when ``backend == 'dask'`` and ``address``
            is None.
    """
    X, y = load_data(mib)

    n_features = 2 ** 18
    pipeline = Pipeline([
        ('vect', HashingVectorizer(n_features=n_features, alternate_sign=False)),
        ('clf', SGDClassifier()),
    ])

    parameters = {
        'vect__norm': ('l1', 'l2'),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5),
        'clf__max_iter': (10, 30, 50, 80),
        'clf__penalty': ('l2', 'l1', 'elasticnet')
    }

    # Every search implementation below receives the same keyword arguments.
    search_kwargs = dict(error_score='raise', refit=refit, cv=5, n_jobs=jobs)

    if backend == 'lithops':
        from sklearn.model_selection import GridSearchCV
        from lithops.util.joblib import register_lithops
        register_lithops()
        grid_search = GridSearchCV(pipeline, parameters, **search_kwargs)

    elif backend == 'ray':
        from sklearn.model_selection import GridSearchCV
        import ray
        from ray.util.joblib import register_ray
        address = 'auto' if address is None else address
        ray.init(address, redis_password='******')
        register_ray()
        grid_search = GridSearchCV(pipeline, parameters, **search_kwargs)

    elif backend == 'tune':
        from tune_sklearn import TuneGridSearchCV
        import ray
        address = 'auto' if address is None else address
        ray.init(address, log_to_driver=False, redis_password='******')
        grid_search = TuneGridSearchCV(pipeline, parameters, **search_kwargs)
        backend = 'loky'  # not used

    elif backend == 'dask':
        from dask_ml.model_selection import GridSearchCV
        from dask_ml.feature_extraction.text import HashingVectorizer as DaskHashingVectorizer
        from distributed import Client
        if address is None:
            # sys.exit is always available; the bare ``exit`` helper is only
            # injected by the ``site`` module.
            import sys
            print('Error: must specify a scheduler address for dask distributed')
            sys.exit(1)
        Client(address=address)
        pipeline = Pipeline([
            ('vect', DaskHashingVectorizer(n_features=n_features, alternate_sign=False)),
            ('clf', SGDClassifier()),
        ])
        grid_search = GridSearchCV(pipeline, parameters, **search_kwargs)

    else:  # loky
        from sklearn.model_selection import GridSearchCV
        grid_search = GridSearchCV(pipeline, parameters, **search_kwargs)

    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters: ", end='')
    pprint(parameters)

    with joblib.parallel_backend(backend):
        print("Performing grid search...")
        t0 = time()
        grid_search.fit(X, y)
        total_time = time() - t0
        print("Done in %0.3fs\n" % total_time)

    if refit:
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
def training_loop(self) -> None:
    """Fit ``self.estimator`` under the Ray joblib backend, checkpoint and report.

    Steps, in order: register the Ray backend, apply ``self.params`` to the
    estimator, pop the training dataset (splitting off an optional
    ``cv_groups`` column), derive the CPU budget from the scaling config,
    export it through the BLAS/OpenMP thread environment variables, fit the
    estimator, pickle it (and the preprocessor, if any) into a Tune
    checkpoint directory, score it, and report the scores together with
    ``fit_time`` via ``tune.report``.

    NOTE(review): block nesting below was reconstructed from a flattened
    source — confirm which statements belong inside the ``parallel_backend``
    context.
    """
    register_ray()

    self.estimator.set_params(**self.params)

    datasets = self._get_datasets()
    # After the pop, ``datasets`` no longer contains the training set.
    X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)

    # An optional "cv_groups" column is split off from the features and
    # later handed to ``_score_cv``.
    groups = None
    if "cv_groups" in X_train.columns:
        groups = X_train["cv_groups"]
        X_train = X_train.drop("cv_groups", axis=1)

    scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
        self.scaling_config)

    num_workers = scaling_config_dataclass.num_workers or 0
    assert num_workers == 0  # num_workers is not in scaling config allowed_keys

    # Default to a single CPU when no trainer resources are configured.
    trainer_resources = scaling_config_dataclass.trainer_resources or {
        "CPU": 1
    }
    has_gpus = bool(trainer_resources.get("GPU", 0))
    num_cpus = int(trainer_resources.get("CPU", 1))

    # see https://scikit-learn.org/stable/computing/parallelism.html
    os.environ["OMP_NUM_THREADS"] = str(num_cpus)
    os.environ["MKL_NUM_THREADS"] = str(num_cpus)
    os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
    os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

    parallelize_cv = self._get_cv_parallelism(has_gpus)
    if self.set_estimator_cpus:
        # When CV itself is parallelized, each estimator gets a single CPU;
        # otherwise the estimator gets the whole CPU budget.
        num_estimator_cpus = 1 if parallelize_cv else num_cpus
        set_cpu_params(self.estimator, num_estimator_cpus)

    with parallel_backend("ray", n_jobs=num_cpus):
        # Wall-clock time of the fit only; reported as "fit_time".
        start_time = time()
        self.estimator.fit(X_train, y_train, **self.fit_params)
        fit_time = time() - start_time

        with tune.checkpoint_dir(step=1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                cpickle.dump(self.estimator, f)

            if self.preprocessor:
                save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

        # Scoring only happens when a label column is configured.
        if self.label_column:
            validation_set_scores = self._score_on_validation_sets(
                self.estimator, datasets)
            cv_scores = self._score_cv(
                self.estimator,
                X_train,
                y_train,
                groups,
                # if estimator has parallelism, use that. Otherwise,
                # parallelize CV
                n_jobs=1 if not parallelize_cv else num_cpus,
            )
        else:
            validation_set_scores = {}
            cv_scores = {}

    # cv_scores will not override validation_set_scores as we
    # check for that during initialization
    results = {
        **validation_set_scores,
        **cv_scores,
        "fit_time": fit_time,
    }
    tune.report(**results)
def test_sklearn_benchmarks(ray_start_cluster_2_nodes):
    """Train a set of scikit-learn classifiers on MNIST under the Ray backend.

    The dataset is fetched from OpenML, scaled to [0, 1] by dividing by 255,
    and the first 6000 samples are used for training. Every estimator is
    seeded, given two jobs when it supports ``n_jobs``, fitted and timed.
    """
    ESTIMATORS = {
        "CART": DecisionTreeClassifier(),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=10),
        "RandomForest": RandomForestClassifier(),
        "Nystroem-SVM": make_pipeline(
            Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=1)),
        "SampledRBF-SVM": make_pipeline(
            RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=1)),
        "LogisticRegression-SAG": LogisticRegression(
            solver="sag", tol=1e-1, C=1e4),
        "LogisticRegression-SAGA": LogisticRegression(
            solver="saga", tol=1e-1, C=1e4),
        "MultilayerPerceptron": MLPClassifier(
            hidden_layer_sizes=(32, 32),
            max_iter=100,
            alpha=1e-4,
            solver="sgd",
            learning_rate_init=0.2,
            momentum=0.9,
            verbose=1,
            tol=1e-2,
            random_state=1),
        "MLP-adam": MLPClassifier(
            hidden_layer_sizes=(32, 32),
            max_iter=100,
            alpha=1e-4,
            solver="adam",
            learning_rate_init=0.001,
            verbose=1,
            tol=1e-2,
            random_state=1),
    }

    # Load dataset.
    print("Loading dataset...")
    data = fetch_openml("mnist_784")
    X = check_array(data["data"], dtype=np.float32, order="C")
    y = data["target"]

    # Normalize features.
    X = X / 255

    # Create train-test split.
    print("Creating train-test split...")
    n_train = 6000
    X_train, y_train = X[:n_train], y[:n_train]

    register_ray()

    train_time = {}
    random_seed = 0
    # Use two workers per classifier.
    num_jobs = 2

    with joblib.parallel_backend("ray"):
        for name, estimator in sorted(ESTIMATORS.items()):
            print("Training %s ... " % name, end="")
            estimator_params = estimator.get_params()

            # Seed every (possibly nested) ``random_state`` parameter.
            seed_overrides = {
                param: random_seed
                for param in estimator_params
                if param.endswith("random_state")
            }
            estimator.set_params(**seed_overrides)

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=num_jobs)

            time_start = time.time()
            estimator.fit(X_train, y_train)
            train_time[name] = time.time() - time_start
            print("training", name, "took", train_time[name], "seconds")
def test_ray_backend(shutdown_only):
    """The active joblib backend inside a "ray" context is a ``RayBackend``."""
    register_ray()
    from ray.util.joblib.ray_backend import RayBackend

    with joblib.parallel_backend("ray"):
        # get_active_backend() returns a sequence whose first item is the backend.
        active_backend = joblib.parallel.get_active_backend()[0]
        assert type(active_backend) == RayBackend
def test_register_ray():
    """Registering the backend adds "ray" to joblib without initializing Ray."""
    register_ray()

    known_backends = joblib.parallel.BACKENDS
    assert "ray" in known_backends

    ray_is_up = ray.is_initialized()
    assert not ray_is_up
# Show the class balance of the target column.
print(df['goodquality'].value_counts())

# Normalize feature variables
X_features = X
X = StandardScaler().fit_transform(X)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

param_model = {'max_depth': range(10, 20), 'max_features': range(3, 11)}

# The timer covers search construction, backend registration and the fit.
start = time.time()
mode = GridSearchCV(DecisionTreeClassifier(random_state=1),
                    param_grid=param_model,
                    scoring='accuracy',
                    n_jobs=-1)
register_ray()
with joblib.parallel_backend('ray'):
    # Fit exactly once: a second fit would repeat the whole grid search and
    # inflate the measured time without changing the fitted result.
    model = mode.fit(X_train, y_train)
print(
    f"executed in {time.time() - start}, nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
def test_sklearn_benchmarks(ray_start_cluster_2_nodes):
    """Train a set of scikit-learn classifiers on a small MNIST sample with Ray.

    The training data is unpickled from ``mnist_784_100_samples.pkl`` located
    next to this file and divided by 255. Every estimator is seeded, given
    two jobs when it supports ``n_jobs``, fitted under the Ray joblib backend
    and timed.
    """
    ESTIMATORS = {
        "CART": DecisionTreeClassifier(),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=10),
        "RandomForest": RandomForestClassifier(),
        "Nystroem-SVM": make_pipeline(
            Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=1)),
        "SampledRBF-SVM": make_pipeline(
            RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=1)),
        "LogisticRegression-SAG": LogisticRegression(
            solver="sag", tol=1e-1, C=1e4),
        "LogisticRegression-SAGA": LogisticRegression(
            solver="saga", tol=1e-1, C=1e4),
        "MultilayerPerceptron": MLPClassifier(
            hidden_layer_sizes=(32, 32),
            max_iter=100,
            alpha=1e-4,
            solver="sgd",
            learning_rate_init=0.2,
            momentum=0.9,
            verbose=1,
            tol=1e-2,
            random_state=1),
        "MLP-adam": MLPClassifier(
            hidden_layer_sizes=(32, 32),
            max_iter=100,
            alpha=1e-4,
            solver="adam",
            learning_rate_init=0.001,
            verbose=1,
            tol=1e-2,
            random_state=1),
    }

    # Load dataset.
    print("Loading dataset...")
    sample_path = os.path.join(
        os.path.dirname(__file__), "mnist_784_100_samples.pkl")
    # Use a context manager so the fixture file handle is always closed.
    with open(sample_path, "rb") as sample_file:
        unnormalized_X_train, y_train = pickle.load(sample_file)

    # Normalize features.
    X_train = unnormalized_X_train / 255

    register_ray()

    train_time = {}
    random_seed = 0
    # Use two workers per classifier.
    num_jobs = 2

    with joblib.parallel_backend("ray"):
        for name in sorted(ESTIMATORS.keys()):
            print("Training %s ... " % name, end="")
            estimator = ESTIMATORS[name]
            estimator_params = estimator.get_params()

            # Seed every (possibly nested) ``random_state`` parameter.
            estimator.set_params(
                **{
                    p: random_seed
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=num_jobs)

            time_start = time.time()
            estimator.fit(X_train, y_train)
            train_time[name] = time.time() - time_start
            print("training", name, "took", train_time[name], "seconds")