def test_multiclass():
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    if SK_GE_020:
        kwargs = {"multi_class": "auto"}
    else:
        kwargs = {}
    clf = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", **kwargs)
    )

    clf.fit(X, y)
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)
def test_multiclass():
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    clf = ParallelPostFit(
        LogisticRegression(
            random_state=0, n_jobs=1, solver="lbfgs", multi_class="auto"
        )
    )

    clf.fit(*dask.compute(X, y))
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_log_proba(X)
    expected = clf.estimator.predict_log_proba(X)
    assert_eq_ar(result, expected)
def test_predict(kind): X, y = make_classification(chunks=100) if kind == "numpy": X, y = dask.compute(X, y) elif kind == "dask.dataframe": X = dd.from_dask_array(X) y = dd.from_dask_array(y) base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs") wrap = ParallelPostFit( LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")) base.fit(*dask.compute(X, y)) wrap.fit(*dask.compute(X, y)) assert_estimator_equal(wrap.estimator, base) result = wrap.predict(X) expected = base.predict(X) assert_eq_ar(result, expected) result = wrap.predict_proba(X) expected = base.predict_proba(X) assert_eq_ar(result, expected) result = wrap.predict_log_proba(X) expected = base.predict_log_proba(X) assert_eq_ar(result, expected)
def test_laziness():
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    x = clf.score(X, y, compute=False)
    assert dask.is_dask_collection(x)
    assert 0 < x.compute() < 1
def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)
def test_no_method_raises():
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    with pytest.raises(AttributeError) as m:
        clf.predict_proba(X)

    assert m.match("The wrapped estimator (.|\n)* 'predict_proba' method.")
def test_auto_rechunk():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    X = X.rechunk({0: 100, 1: 10})
    clf.fit(X, y)

    assert clf.predict(X).compute().shape == (1000,)
    assert clf.predict_proba(X).compute().shape == (1000, 2)
    assert clf.score(X, y) == clf.score(X.compute(), y.compute())
def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    X_, y_ = dask.compute(X, y)
    clf.fit(X_, y_)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    result = clf.score(X, y)
    expected = clf.estimator.score(X_, y_)
    assert result == expected
def test_multiclass():
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    clf = ParallelPostFit(LogisticRegression(random_state=0))

    clf.fit(X, y)
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)
def train(self, X_train: np.ndarray, y_train: np.ndarray,
          X_test: np.ndarray, y_test: np.ndarray,
          verbose: bool = True, optimize: bool = False):
    # Wrap the underlying classifier so post-fit methods can run block-wise.
    X_train_prepared = self._preprocess_dataset(X_train)
    clf = ParallelPostFit(self.classifier, scoring='accuracy')
    self.classifier = clf.fit(X_train_prepared, y_train)

    # Predict class probabilities on the test set and map them to labels.
    X_test_prepared = self._preprocess_dataset(X_test)
    prediction = self.classifier.predict_proba(X_test_prepared)
    y_proba_list = [
        self._predict_proba_to_label(proba).value for proba in prediction
    ]

    if verbose:
        self.evaluate(y_proba_list, y_test, classes=self.classifier.classes_)

    if optimize:
        opt_classifier = ImageModelOptimiser(self).optimize(X_train, y_train)
        self.classifier = opt_classifier.classifier
def test_transform(kind):
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    result = wrap.transform(X)
    expected = base.transform(*dask.compute(X))
    assert_eq_ar(result, expected)
def test_predict(kind):
    X, y = make_classification(chunks=100)

    if kind == 'numpy':
        X, y = dask.compute(X, y)
    elif kind == 'dask.dataframe':
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0)
    wrap = ParallelPostFit(LogisticRegression(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = wrap.predict(X)
    expected = base.predict(X)
    assert_eq_ar(result, expected)

    result = wrap.predict_proba(X)
    expected = base.predict_proba(X)
    assert_eq_ar(result, expected)
def test_warning_on_dask_array_without_array_function():
    X, y = make_classification(n_samples=10, n_features=2, chunks=10)
    clf = ParallelPostFit(GradientBoostingClassifier())
    clf = clf.fit(X, y)

    class FakeArray:
        def __init__(self, value):
            self.value = value

        @property
        def ndim(self):
            return self.value.ndim

        @property
        def len(self):
            return self.value.len

        @property
        def dtype(self):
            return self.value.dtype

        @property
        def shape(self):
            return self.value.shape

    ar = FakeArray(np.zeros(shape=(2, 2)))
    fake_dask_ar = da.from_array(ar)
    fake_dask_ar._meta = FakeArray(np.zeros(shape=(0, 0)))

    with pytest.warns(
        UserWarning, match="provide explicit `predict_meta` to the dask_ml.wrapper"
    ):
        clf.predict(fake_dask_ar)

    with pytest.warns(
        UserWarning,
        match="provide explicit `predict_proba_meta` to the dask_ml.wrapper",
    ):
        clf.predict_proba(fake_dask_ar)
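# The warning exercised above also points at the remedy: when dask cannot infer
# the output type of the wrapped estimator, pass explicit metadata. A minimal
# sketch, assuming the `predict_meta` / `predict_proba_meta` keyword arguments
# of ParallelPostFit (present in recent dask-ml releases); the sizes and dtypes
# below are illustrative only.
import dask
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit

X, y = make_classification(n_samples=100, chunks=50)
clf = ParallelPostFit(
    GradientBoostingClassifier(),
    predict_meta=np.empty(0, dtype=np.int64),                # expected predict output
    predict_proba_meta=np.empty((0, 2), dtype=np.float64),   # expected predict_proba output
)
clf.fit(*dask.compute(X, y))  # fit eagerly on concrete arrays; prediction stays lazy
print(clf.predict(X).dtype, clf.predict_proba(X).dtype)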
def train_model(x_train, y_train):
    clf = ParallelPostFit(estimator=GaussianNB(), scoring='accuracy')
    clf.fit(x_train, y_train)
    return clf
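# Hypothetical usage of train_model above: fit GaussianNB on a small in-memory
# slice, then score lazily over chunked dask arrays. Names and sizes are
# illustrative only; this is a sketch, not part of the original module.
import dask.array as da
from sklearn.datasets import make_classification as sk_make_classification

x, y = sk_make_classification(n_samples=10_000, random_state=0)
model = train_model(x[:1_000], y[:1_000])   # fit happens eagerly on the slice

dx = da.from_array(x, chunks=1_000)         # chunked feature matrix
dy = da.from_array(y, chunks=1_000)
print(model.score(dx, dy))                  # 'accuracy' scoring evaluated block-wise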
import numpy as np
import pandas as pd
from dask.distributed import Client
from sklearn import neural_network
from dask_ml.wrappers import ParallelPostFit

# Scale up: connect to your own cluster with more resources
# see http://dask.pydata.org/en/latest/setup.html
client = Client(processes=False, threads_per_worker=4,
                n_workers=1, memory_limit='2GB')
print(client)

dtype = {
    'total': np.float64,
    'temperature': np.int32,
    'humidity': np.float64,
    'solar': np.float64,
    'car_connected': np.int32,
    'car_energy': np.int32,
    'battery_energy': np.int32,
    'current_temperature': np.int32,
    'b': np.int32,
    'c': np.int32,
    'air': np.int32,
    'cost': np.int32
}

x = pd.read_csv('train_data.csv', dtype=dtype)
y = x.pop('cost').values

mlp = ParallelPostFit(
    neural_network.MLPRegressor(hidden_layer_sizes=(16,), solver='adam'),
    scoring="r2")

print('Training')
mlp.fit(x, y)
print('Finished')
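# Once the wrapper is fit, the payoff is block-wise prediction over partitioned
# data. A minimal follow-on sketch, assuming a hypothetical 'predict_data.csv'
# with the same columns as train_data.csv; the file name is illustrative only.
import dask.dataframe as dd

big_x = dd.read_csv('predict_data.csv', dtype=dtype, blocksize='16MB')
big_y = big_x['cost']
big_x = big_x.drop(columns='cost')

# predict() maps the fitted MLPRegressor over each partition and returns a lazy
# dask collection; nothing is read or computed until .compute() is called.
predictions = mlp.predict(big_x)
print(predictions.compute()[:5])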
While only predict is demonstrated here, wrappers.ParallelPostFit is equally
useful for predict_proba and transform.
"""
from timeit import default_timer as tic

import pandas as pd
import seaborn as sns
import sklearn.datasets
from sklearn.svm import SVC

import dask_ml.datasets
from dask_ml.wrappers import ParallelPostFit

X, y = sklearn.datasets.make_classification(n_samples=1000)
clf = ParallelPostFit(SVC(gamma='scale'))
clf.fit(X, y)

Ns = [100_000, 200_000, 400_000, 800_000]
timings = []

for n in Ns:
    X, y = dask_ml.datasets.make_classification(n_samples=n,
                                                 random_state=n,
                                                 chunks=n // 20)
    t1 = tic()
    # Serial scikit-learn version
    clf.estimator.predict(X)
    timings.append(('Scikit-Learn', n, tic() - t1))

    t1 = tic()
    # Parallelized scikit-learn version
def test_sklearn():
    import dask.array as da
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups
    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                               cv=3, refit=False, iid=False)
    grid_search.fit(data.data, data.target)

    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)

    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    X_train, y_train = make_classification(n_features=2, n_redundant=0,
                                           n_informative=2, random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)
    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])

    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)
    # from tpot import TPOTClassifier
    pass