import dask
import dask.array as da
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression

from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit


def test_auto_rechunk():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    # Chunk along the feature axis too; ParallelPostFit should rechunk
    # it back to a single block automatically before predicting.
    X = X.rechunk({0: 100, 1: 10})
    clf.fit(X, y)

    assert clf.predict(X).compute().shape == (1000,)
    assert clf.predict_proba(X).compute().shape == (1000, 2)
    assert clf.score(X, y) == clf.score(X.compute(), y.compute())


def test_laziness():
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    # With compute=False, score returns a lazy dask collection rather
    # than a concrete number.
    x = clf.score(X, y, compute=False)
    assert dask.is_dask_collection(x)
    assert 0 < x.compute() < 1


def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    # Scoring through the wrapper should match the wrapped estimator.
    result = clf.score(X, y)
    expected = clf.estimator.score(X, y)
    assert result == expected
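# Rough sketch (not dask-ml's actual implementation) of the idea the
# tests above exercise: ParallelPostFit.predict applies the fitted
# estimator's predict to each block of the dask array. The helper name
# and the assumed int64 label dtype are hypothetical.
import dask.array as da
import numpy as np


def blockwise_predict(estimator, X):
    # X: a dask array chunked along axis 0 only; each block is a
    # NumPy array of rows, so the plain sklearn predict applies.
    return X.map_blocks(
        lambda block: estimator.predict(block),
        drop_axis=1,     # predict returns one label per row
        dtype=np.int64,  # assumed label dtype
    )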
"module__activation": [ "relu", "elu", ], "batch_size": [32, 64], "optimizer__lr": loguniform(1e-4, 1e-3), "optimizer__weight_decay": loguniform(1e-6, 1e-3), "optimizer__momentum": uniform(0, 1), "optimizer__nesterov": [True], } from dask_ml.model_selection import HyperbandSearchCV search = HyperbandSearchCV(model, params, random_state=2, verbose=True, max_iter=9) y_train2 = y_train.reshape(-1, 1).persist() search.fit(X_train, y_train2) print(search.best_score_) print(search.best_params_) print(search.best_estimator_) from dask_ml.wrappers import ParallelPostFit deployed_model = ParallelPostFit(search.best_estimator_) deployed_model.score(X_test, y_test)
def test_sklearn():
    import joblib  # sklearn.externals.joblib was removed; import joblib directly
    from sklearn.datasets import fetch_20newsgroups, load_digits, make_classification
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    import dask.array as da
    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3,
                               refit=False)
    grid_search.fit(data.data, data.target)

    # The same search, but with joblib dispatching its tasks to a dask
    # cluster (assumes a dask.distributed Client has been created).
    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # Use estimator__param (not param) to reach the wrapped estimator.
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, cv=3)
    grid_search.fit(X, y)

    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    X_train, y_train = make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1, n_samples=1000)

    # Tile the small NumPy arrays into large dask arrays for scoring.
    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])

    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)
    # from tpot import TPOTClassifier
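# The joblib.parallel_backend('dask') block above assumes a
# dask.distributed scheduler is already running and registered. A
# minimal local setup might look like this (the worker counts are
# illustrative, not from the original):
from dask.distributed import Client

# Creating the Client registers it as the default scheduler, so both
# the joblib backend and the dask arrays above will use it.
client = Client(n_workers=4, threads_per_worker=1)
print(client)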