Example #1
from sklearn.ensemble import GradientBoostingClassifier

from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit


def test_auto_rechunk():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    # Split the feature axis as well; the wrapper should rechunk it back automatically.
    X = X.rechunk({0: 100, 1: 10})
    clf.fit(X, y)

    assert clf.predict(X).compute().shape == (1000,)
    assert clf.predict_proba(X).compute().shape == (1000, 2)
    assert clf.score(X, y) == clf.score(X.compute(), y.compute())
Example #2
import dask
from sklearn.linear_model import LinearRegression

from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit


def test_laziness():
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    x = clf.score(X, y, compute=False)
    assert dask.is_dask_collection(x)
    assert 0 < x.compute() < 1
Example #3
import dask.array as da
from sklearn.ensemble import GradientBoostingClassifier

from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit


def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())

    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    result = clf.score(X, y)
    expected = clf.estimator.score(X, y)
    assert result == expected
Example #4
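This snippet is cut off at the top: the search-space dictionary is missing its opening line and `model` is never defined. The `module__*` and `optimizer__*` prefixes suggest a skorch-wrapped PyTorch network, so the following is a minimal, hypothetical sketch of what `model` could look like; the `MLP` module, its layer sizes, and the training settings are assumptions, not part of the original example. Likewise, `loguniform` and `uniform` below are assumed to be the scipy.stats distributions.

import torch
from skorch import NeuralNetClassifier
from torch import nn


class MLP(nn.Module):
    # Hypothetical module; only the activation is tuned by the search below.
    def __init__(self, activation="relu"):
        super().__init__()
        self.act = nn.ReLU() if activation == "relu" else nn.ELU()
        self.hidden = nn.Linear(2, 16)
        self.out = nn.Linear(16, 2)

    def forward(self, X):
        # Return log-probabilities to match NeuralNetClassifier's default NLLLoss.
        return torch.log_softmax(self.out(self.act(self.hidden(X))), dim=-1)


model = NeuralNetClassifier(
    module=MLP,
    optimizer=torch.optim.SGD,
    train_split=None,  # HyperbandSearchCV manages its own validation split
)
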
    "module__activation": [
        "relu",
        "elu",
    ],
    "batch_size": [32, 64],
    "optimizer__lr": loguniform(1e-4, 1e-3),
    "optimizer__weight_decay": loguniform(1e-6, 1e-3),
    "optimizer__momentum": uniform(0, 1),
    "optimizer__nesterov": [True],
}

from dask_ml.model_selection import HyperbandSearchCV
search = HyperbandSearchCV(model,
                           params,
                           random_state=2,
                           verbose=True,
                           max_iter=9)

# Persist so the reshaped target is computed once and kept in (distributed) memory
# across Hyperband's repeated passes over the data.
y_train2 = y_train.reshape(-1, 1).persist()
search.fit(X_train, y_train2)

print(search.best_score_)

print(search.best_params_)

print(search.best_estimator_)

from dask_ml.wrappers import ParallelPostFit
deployed_model = ParallelPostFit(search.best_estimator_)
deployed_model.score(X_test, y_test)
Example #5
def test_sklearn():
    import joblib  # sklearn.externals.joblib is deprecated; use the standalone package
    import dask.array as da
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups

    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               cv=3,
                               refit=False)
    grid_search.fit(data.data, data.target)

    # The 'dask' joblib backend needs a running dask.distributed Client.
    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, cv=3)
    grid_search.fit(X, y)

    # Tile the digits data into a larger dask array; predict() on the fitted
    # ParallelPostFit estimator stays lazy and runs block by block.
    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    # Train on a small in-memory dataset, then score a dask array built from N tiled copies of it.
    X_train, y_train = make_classification(n_features=2,
                                           n_redundant=0,
                                           n_informative=2,
                                           random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)

    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)

    # from tpot import TPOTClassifier
    pass
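
The commented-out `partial_fit` line above hints at incremental training on larger-than-memory data. A minimal, hypothetical sketch of that idea using dask_ml.wrappers.Incremental follows; the SGDClassifier choice and the synthetic dataset are assumptions, not part of the original test.

from sklearn.linear_model import SGDClassifier

from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental

X_big, y_big = make_classification(n_samples=10_000, chunks=1_000)
est = Incremental(SGDClassifier(max_iter=1000, tol=1e-3))
# Incremental feeds each dask chunk to the underlying estimator's partial_fit.
est.fit(X_big, y_big, classes=[0, 1])
print(est.score(X_big, y_big))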