Example #1
def test_keras(c, s, a, b):
    # Mirror the mnist dataset
    X, y = make_classification(n_classes=10, n_features=784, n_informative=100)
    X = X.astype("float32")
    assert y.dtype == np.dtype("int64")

    model = KerasClassifier(build_fn=_keras_build_fn, lr=0.01, verbose=False)
    params = {"lr": loguniform(1e-3, 1e-1)}

    search = IncrementalSearchCV(model,
                                 params,
                                 max_iter=3,
                                 n_initial_parameters=5,
                                 decay_rate=None)
    yield search.fit(X, y)

    assert search.best_score_ >= 0

    # Make sure the model trains, and scores aren't constant
    scores = {
        ident: [h["score"] for h in hist]
        for ident, hist in search.model_history_.items()
    }
    assert all(len(hist) == 3 for hist in scores.values())
    nuniq_scores = [pd.Series(v).nunique() for v in scores.values()]
    assert max(nuniq_scores) > 1
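
Example #1 assumes a _keras_build_fn helper defined elsewhere in the test suite. A minimal sketch, assuming a single dense softmax layer over the 784 MNIST-style features (the architecture and optimizer choice are assumptions, not the suite's exact definition):

from tensorflow import keras

def _keras_build_fn(lr=0.01):
    # Hypothetical reconstruction: one dense softmax layer, ten classes,
    # with the searched learning rate wired into the optimizer.
    model = keras.Sequential(
        [keras.layers.Dense(10, input_shape=(784,), activation="softmax")]
    )
    model.compile(
        optimizer=keras.optimizers.SGD(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
    )
    return model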
Example #2
def _test_verbosity(c, s, a, b):
    # max_iter and verbose are supplied by the enclosing parametrized test.
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = IncrementalSearchCV(model, params, max_iter=max_iter, verbose=verbose)
    yield search.fit(X, y)
    return search
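
Several of these examples rely on a ConstantFunction stub from the dask-ml test utilities. A minimal sketch reconstructed from how it is called here; treat the exact details as assumptions:

from sklearn.base import BaseEstimator

class ConstantFunction(BaseEstimator):
    def __init__(self, value=0):
        self.value = value

    def fit(self, X, y=None, **kwargs):
        return self

    def partial_fit(self, X, y=None, **kwargs):
        return self

    def score(self, X, y=None):
        # The score ignores the data entirely; it is just the sampled
        # "value" hyperparameter, which makes search behavior deterministic.
        return self.value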
Example #3
def test_pytorch(c, s, a, b):

    n_features = 10
    defaults = {
        "callbacks": False,
        "warm_start": False,
        "train_split": None,
        "max_epochs": 1,
    }
    model = NeuralNetRegressor(
        module=ShallowNet,
        module__n_features=n_features,
        criterion=nn.MSELoss,
        optimizer=optim.SGD,
        optimizer__lr=0.1,
        batch_size=64,
        **defaults,
    )

    model2 = clone(model)
    assert model.callbacks is False
    assert model.warm_start is False
    assert model.train_split is None
    assert model.max_epochs == 1

    params = {"optimizer__lr": loguniform(1e-3, 1e0)}
    X, y = make_regression(n_samples=100, n_features=n_features)
    X = X.astype("float32")
    y = y.astype("float32").reshape(-1, 1)
    search = IncrementalSearchCV(model2, params, max_iter=5, decay_rate=None)
    yield search.fit(X, y)
    assert search.best_score_ >= 0
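
The ShallowNet module in Example #3 is also defined elsewhere in the test suite. A minimal sketch, assuming a single linear layer (the layer sizes are assumptions):

import torch.nn as nn

class ShallowNet(nn.Module):
    def __init__(self, n_features=5):
        super().__init__()
        # One linear layer mapping n_features inputs to a single output,
        # matching the regression target shape (-1, 1) used above.
        self.layer = nn.Linear(n_features, 1)

    def forward(self, x):
        return self.layer(x)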
Example #4
def test_min_max_iter(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    est = SGDClassifier()
    params = {"alpha": np.logspace(-3, 0)}
    search = IncrementalSearchCV(est, params, max_iter=0)
    with pytest.raises(ValueError, match="max_iter < 1 is not supported"):
        yield search.fit(X, y, classes=[0, 1])
Example #5
def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": np.logspace(-2, 10, 10), "l1_ratio": np.linspace(0.01, 1, 20)}

    search = IncrementalSearchCV(model, params, n_initial_parameters=10)
    yield search.fit(X, y, classes=[0, 1])
Example #6
def test_small(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5, 0.75, 1.0]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])
    (X_,) = yield c.compute([X])
    search.predict(X_)
Example #7
def test_search_max_iter(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": np.logspace(-2, 10, 10), "l1_ratio": np.linspace(0.01, 1, 20)}

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=1)
    yield search.fit(X, y, classes=[0, 1])
    for d in search.history_:
        assert d["partial_fit_calls"] <= 1
Example #8
def test_warns_scores_per_fit(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, scores_per_fit=2)
    with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
        yield search.fit(X, y)
Example #9
def test_transform(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = MiniBatchKMeans(random_state=0)
    params = {"n_clusters": [3, 4, 5], "n_init": [1, 2]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y)
    X_, = yield c.compute([X])
    result = search.transform(X_)
    assert result.shape == (100, search.best_estimator_.n_clusters)
Example #10
async def test_smaller(c, s, a, b):
    # infinite loop
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    await search.fit(X, y, classes=[0, 1])
    X_ = await c.compute(X)
    search.predict(X_)
Example #11
def test_gridsearch_func(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    model = SGDClassifier(tol=1e-3)

    params = {"alpha": np.logspace(-2, 10, 3), "l1_ratio": np.linspace(0.01, 1, 2)}

    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])

    assert {frozenset(d["params"].items()) for d in search.history_} == {
        frozenset(d.items()) for d in ParameterGrid(params)
    }
Example #12
def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-5, -3, 10),
        "l1_ratio": np.linspace(0, 1, 20),
    }

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=10)
    yield search.fit(X, y, classes=[0, 1])

    # smoke test to ensure search completed successfully
    assert search.best_score_ > 0
Example #13
async def test_search_plateau_tol(c, s, a, b):
    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Every 3 calls, score increases by 3. tol=1: the model improved enough
    search = IncrementalSearchCV(model, params, patience=3, tol=1, max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    await search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, score increases by 3. tol=4: model didn't improve enough
    search = IncrementalSearchCV(model, params, patience=3, tol=4, max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    await search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
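
LinearFunction here is the stub whose full inline definition appears in Example #26: its score grows by slope with every partial_fit call, which is what the patience/tol arithmetic in the comments relies on. A minimal sketch for reference:

from sklearn.base import BaseEstimator

class LinearFunction(BaseEstimator):
    def __init__(self, intercept=0, slope=1, foo=0):
        self._num_calls = 0
        self.intercept = intercept
        self.slope = slope
        self.foo = foo  # the searched (but otherwise unused) hyperparameter

    def fit(self, *args):
        return self

    def partial_fit(self, *args, **kwargs):
        self._num_calls += 1
        return self

    def score(self, *args, **kwargs):
        # Score increases linearly with the number of partial_fit calls.
        return self.intercept + self.slope * self._num_calls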
Example #14
async def test_warns_decay_rate(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    kwargs = dict(max_iter=5, n_initial_parameters=5)
    search = IncrementalSearchCV(model, params, **kwargs)
    match = r"deprecated since Dask-ML v1.4.0."
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure the printed warning message works
    search = IncrementalSearchCV(model, params, decay_rate=None, **kwargs)
    await search.fit(X, y)
Example #15
def test_warns_decay_rate_wanted(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield search.fit(X, y)

    # Make sure old behavior is retained w/o warning
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    yield search.fit(X, y)
Example #16
def test_search_patience_infeasible_tol(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"value": rng.rand(1000)}
    model = ConstantFunction()

    max_iter = 10
    score_increase = -10
    search = IncrementalSearchCV(
        model, params, max_iter=max_iter, patience=3, tol=score_increase,
    )
    yield search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == max_iter
Example #17
def test_search_plateau_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    class ConstantClassifier(SGDClassifier):
        def __init__(self, value=0):
            self.value = value
            super(ConstantClassifier, self).__init__(tol=1e-3)

        def score(self, *args, **kwargs):
            return self.value

    params = {"value": np.random.rand(10)}
    model = ConstantClassifier()

    search = IncrementalSearchCV(model,
                                 params,
                                 n_initial_parameters=10,
                                 patience=5,
                                 tol=0,
                                 max_iter=10)
    yield search.fit(X, y, classes=[0, 1])

    assert search.history_
    for h in search.history_:
        assert h["partial_fit_calls"] <= 5
    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ == params["value"].max() == search.best_estimator_.value
    assert "visualize" not in search.__dict__
    assert search.best_score_ > 0

    X_test, y_test = yield c.compute([X, y])

    search.predict(X_test)
    search.score(X_test, y_test)
Example #18
async def test_search_invalid_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, patience=1, max_iter=10)
    with pytest.raises(ValueError, match="patience >= 2"):
        await search.fit(X, y, classes=[0, 1])

    search = IncrementalSearchCV(model, params, patience=2.0, max_iter=10)
    with pytest.raises(ValueError, match="patience must be an integer"):
        await search.fit(X, y, classes=[0, 1])

    # Make sure this passes
    search = IncrementalSearchCV(model, params, patience=False, max_iter=10)
    await search.fit(X, y, classes=[0, 1])
    assert search.history_
Example #19
async def test_verbosity_types(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    for verbose in [-1.0, 1.2]:
        search = IncrementalSearchCV(model,
                                     params,
                                     verbose=verbose,
                                     max_iter=3)
        with pytest.raises(ValueError, match="0 <= verbose <= 1"):
            await search.fit(X, y)

    for verbose in [0.0, 0, 1, 1.0, True, False]:
        search = IncrementalSearchCV(model,
                                     params,
                                     verbose=verbose,
                                     max_iter=3)
        await search.fit(X, y)
Example #20
async def test_search_basic_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"slope": 2 + rng.rand(1000)}
    model = LinearFunction()

    # Test the case where tol is too small (all models finish)
    max_iter = 15
    patience = 5
    increase_after_patience = patience
    search = IncrementalSearchCV(
        model,
        params,
        max_iter=max_iter,
        tol=increase_after_patience,
        patience=patience,
        fits_per_score=3,
    )
    await search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    # +1 (and +2 below) because scores_per_fit isn't exact
    assert hist.partial_fit_calls.max() == max_iter + 1

    # Test the case where tol is too large (no models finish)
    patience = 5
    increase_after_patience = patience
    params = {"slope": 0 + 0.9 * rng.rand(1000)}
    search = IncrementalSearchCV(
        model,
        params,
        max_iter=max_iter,
        tol=increase_after_patience,
        patience=patience,
        fits_per_score=3,
    )
    await search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == patience + 2
Example #21
async def test_model_future(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()
    model_future = await c.scatter(model)

    search = IncrementalSearchCV(model_future, params, max_iter=10)

    await search.fit(X, y, classes=[0, 1])
    assert search.history_
    assert search.best_score_ > 0
Example #22
def test_same_models_with_random_state(c, s, a, b):
    X, y = make_classification(n_samples=100,
                               n_features=2,
                               chunks=(10, 5),
                               random_state=0)
    model = Incremental(
        SGDClassifier(tol=-np.inf,
                      penalty="elasticnet",
                      random_state=42,
                      eta0=0.1))
    params = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    params = {"estimator__" + k: v for k, v in params.items()}
    search1 = IncrementalSearchCV(clone(model),
                                  params,
                                  n_initial_parameters=10,
                                  random_state=0)
    search2 = IncrementalSearchCV(clone(model),
                                  params,
                                  n_initial_parameters=10,
                                  random_state=0)

    yield search1.fit(X, y, classes=[0, 1])
    yield search2.fit(X, y, classes=[0, 1])

    assert search1.best_score_ == search2.best_score_
    assert search1.best_params_ == search2.best_params_
    assert np.allclose(search1.best_estimator_.coef_,
                       search2.best_estimator_.coef_)
Example #23
def test_high_performing_models_are_retained_with_patience(c, s, a, b):
    """
    This test covers the case where high-performing models plateau before
    the search finishes.

    This covers the use case where one poor-performing model takes a long
    time to converge, but all other high-performing models have finished
    (and plateaued).

    Details
    -------
    This test defines

    * low performing models that continue to improve
    * high performing models that are constant

    It uses a small tolerance to stop the constant (and high-performing) models.

    This test is only concerned with making sure the high-performing model is
    retained after it has reached a plateau. It is not concerned with making
    sure models are killed off at correct times.
    """

    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    params = {"final_score": [1, 2, 3, 4, 5]}
    search = IncrementalSearchCV(
        _MaybeLinearFunction(),
        params,
        patience=2,
        tol=1e-3,  # only stop the constant functions
        decay_rate=0,
        n_initial_parameters="grid",
        max_iter=20,
    )

    search._adapt = _remove_worst_performing_model
    yield search.fit(X, y)
    assert search.best_params_ == {"final_score": 5}
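
The _MaybeLinearFunction estimator and _remove_worst_performing_model adaptation function assumed by this example appear to match the inline MaybeLinearFunction and remove_worst_performing_model definitions shown in Example #28.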
Example #24
def test_same_random_state_same_params(c, s, a, b):
    # This makes sure parameters are sampled correctly when random state is
    # specified.

    # This test makes sure random state is *correctly* passed to successive
    # halvings from Hyperband
    seed = 0
    values = scipy.stats.uniform(0, 1)
    h = HyperbandSearchCV(ConstantFunction(), {"value": values},
                          random_state=seed,
                          max_iter=9)

    # Build a passive random search for comparison
    passive = IncrementalSearchCV(
        ConstantFunction(),
        {"value": values},
        random_state=seed,
        max_iter=2,
        n_initial_parameters=h.metadata["n_models"],
    )
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    yield h.fit(X, y)
    yield passive.fit(X, y)

    # Get the parameter values the Hyperband search evaluated
    v_h = h.cv_results_["param_value"]

    # Check to make sure the random passive search had *some* of the same params
    v_passive = passive.cv_results_["param_value"]
    # Sanity checks to make sure all unique floats
    assert len(set(v_passive)) == len(v_passive)
    assert len(set(v_h)) == len(v_h)

    # Getting the `value`s that are the same for both searches
    same = set(v_passive).intersection(set(v_h))

    passive_models = h.metadata["brackets"][0]["n_models"]
    assert len(same) == passive_models
Example #25
def test_same_params_with_random_state(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": scipy.stats.uniform(1e-4, 1)}

    search1 = IncrementalSearchCV(
        model, params, n_initial_parameters=10, random_state=0
    )
    yield search1.fit(X, y, classes=[0, 1])
    params1 = search1.cv_results_["param_alpha"]

    search2 = IncrementalSearchCV(
        model, params, n_initial_parameters=10, random_state=0
    )
    yield search2.fit(X, y, classes=[0, 1])
    params2 = search2.cv_results_["param_alpha"]

    assert np.allclose(params1, params2)
Example #26
def test_search_plateau_tol(c, s, a, b):
    class LinearFunction(BaseEstimator):
        def __init__(self, intercept=0, slope=1, foo=0):
            self._num_calls = 0
            self.intercept = intercept
            self.slope = slope
            # "foo" is the searched hyperparameter; store it so that
            # clone()/set_params() work during the search.
            self.foo = foo
            super(LinearFunction, self).__init__()

        def fit(self, *args):
            return self

        def partial_fit(self, *args, **kwargs):
            self._num_calls += 1
            return self

        def score(self, *args, **kwargs):
            return self.intercept + self.slope * self._num_calls

    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Every 3 calls, score increases by 3. tol=1: the model improved enough
    search = IncrementalSearchCV(model,
                                 params,
                                 patience=3,
                                 tol=1,
                                 max_iter=10,
                                 decay_rate=0)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, score increases by 3. tol=4: model didn't improve enough
    search = IncrementalSearchCV(model,
                                 params,
                                 patience=3,
                                 tol=4,
                                 decay_rate=0,
                                 max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
Example #27
def test_history(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = IncrementalSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)
    gt_zero = lambda x: x >= 0
    gt_one = lambda x: x >= 1

    key_types_and_checks = [
        ("mean_partial_fit_time", float, gt_zero),
        ("mean_score_time", float, gt_zero),
        ("std_partial_fit_time", float, gt_zero),
        ("std_score_time", float, gt_zero),
        ("test_score", float, gt_zero),
        ("rank_test_score", int, gt_one),
        ("model_id", int, None),
        ("partial_fit_calls", int, gt_zero),
        ("params", dict, lambda d: set(d.keys()) == {"value"}),
        ("param_value", float, gt_zero),
    ]
    assert set(alg.cv_results_) == {v[0] for v in key_types_and_checks}
    for column, dtype, condition in key_types_and_checks:
        if dtype:
            assert alg.cv_results_[column].dtype == dtype
        if condition:
            assert all(condition(x) for x in alg.cv_results_[column])

    alg.best_estimator_.fit(X, y)
    alg.best_estimator_.score(X, y)
    alg.score(X, y)

    # Test types/format of all parameters we set after fitting
    assert isinstance(alg.best_index_, int)
    assert isinstance(alg.best_estimator_, ConstantFunction)
    assert isinstance(alg.best_score_, float)
    assert isinstance(alg.best_params_, dict)
    assert isinstance(alg.history_, list)
    assert all(isinstance(h, dict) for h in alg.history_)
    assert isinstance(alg.model_history_, dict)
    assert all(vi in alg.history_ for v in alg.model_history_.values()
               for vi in v)
    assert all(isinstance(v, np.ndarray) for v in alg.cv_results_.values())
    assert isinstance(alg.multimetric_, bool)

    keys = {
        "score",
        "score_time",
        "partial_fit_calls",
        "partial_fit_time",
        "model_id",
        "elapsed_wall_time",
        "params",
    }
    assert all(set(h.keys()) == keys for h in alg.history_)
    # Make sure history_ is ordered by wall time
    times = [v["elapsed_wall_time"] for v in alg.history_]
    assert (np.diff(times) >= 0).all()
    for model_hist in alg.model_history_.values():
        calls = [h["partial_fit_calls"] for h in model_hist]
        assert (np.diff(calls) >= 1).all() or len(calls) == 1
Example #28
def test_high_performing_models_are_retained_with_patience(c, s, a, b):
    """
    This test covers the case where high-performing models plateau before
    the search finishes.

    This covers the use case where one poor-performing model takes a long
    time to converge, but all other high-performing models have finished
    (and plateaued).

    Details
    -------
    This test defines

    * low performing models that continue to improve
    * high performing models that are constant

    It uses a small tolerance to stop the constant (and high-performing) models.

    This test is only concerned with making sure the high-performing model is
    retained after it has reached a plateau. It is not concerned with making
    sure models are killed off at correct times.
    """
    class MaybeLinearFunction(BaseEstimator):
        def __init__(self, final_score=1):
            self.final_score = final_score
            self._calls = 0

        def fit(self, X, y):
            return self

        def partial_fit(self, X, y):
            self._calls += 1
            return self

        def score(self, X, y):
            if self.final_score <= 3:
                return self.final_score * (1 - 1 / (self._calls + 2))
            return self.final_score

    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    params = {"final_score": [1, 2, 3, 4, 5]}
    search = IncrementalSearchCV(
        MaybeLinearFunction(),
        params,
        patience=2,
        tol=1e-3,  # only stop the constant functions
        decay_rate=0,
        n_initial_parameters="grid",
        max_iter=20,
    )

    def remove_worst_performing_model(info):
        calls = {v[-1]["partial_fit_calls"] for v in info.values()}
        ests = {v[-1]["params"]["final_score"] for v in info.values()}

        if max(calls) == 1:
            assert all(x in ests for x in [1, 2, 3, 4, 5])
        elif max(calls) == 2:
            assert all(x in ests for x in [2, 3, 4, 5])
            assert all(x not in ests for x in [1])
        elif max(calls) == 3:
            assert all(x in ests for x in [3, 4, 5])
            assert all(x not in ests for x in [1, 2])
        elif max(calls) == 4:
            assert all(x in ests for x in [4, 5])
            assert all(x not in ests for x in [1, 2, 3])
        elif max(calls) == 5:
            assert all(x in ests for x in [5])
            assert all(x not in ests for x in [1, 2, 3, 4])
            return {k: 0 for k in info.keys()}

        recent_scores = {
            k: v[-1]["score"]
            for k, v in info.items()
            if v[-1]["partial_fit_calls"] == max(calls)
        }
        return {
            k: 1
            for k, v in recent_scores.items()
            if v > min(recent_scores.values())
        }

    search._adapt = remove_worst_performing_model
    yield search.fit(X, y)
    assert search.best_params_ == {"final_score": 5}
Example #29
def _test_search_basic(decay_rate, c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=5, chunks=(100, 5))
    model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")

    params = {
        "alpha": np.logspace(-2, 2, 100),
        "l1_ratio": np.linspace(0.01, 1, 200)
    }

    search = IncrementalSearchCV(model,
                                 params,
                                 n_initial_parameters=20,
                                 max_iter=10,
                                 decay_rate=decay_rate)
    yield search.fit(X, y, classes=[0, 1])

    assert search.history_
    for d in search.history_:
        assert d["partial_fit_calls"] <= search.max_iter + 1
    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ > 0
    assert "visualize" not in search.__dict__
    assert search.best_params_
    assert search.cv_results_ and isinstance(search.cv_results_, dict)
    assert {
        "mean_partial_fit_time",
        "mean_score_time",
        "std_partial_fit_time",
        "std_score_time",
        "test_score",
        "rank_test_score",
        "model_id",
        "params",
        "partial_fit_calls",
        "param_alpha",
        "param_l1_ratio",
    }.issubset(set(search.cv_results_.keys()))
    assert len(search.cv_results_["param_alpha"]) == 20

    assert all(isinstance(v, np.ndarray) for v in search.cv_results_.values())
    if decay_rate == 0:
        assert (search.cv_results_["test_score"][search.best_index_] >=
                search.cv_results_["test_score"]).all()
        assert search.cv_results_["rank_test_score"][search.best_index_] == 1
    else:
        assert all(search.cv_results_["test_score"] >= 0)
        assert all(search.cv_results_["rank_test_score"] >= 1)
    assert all(search.cv_results_["partial_fit_calls"] >= 1)
    assert len(np.unique(search.cv_results_["model_id"])) == len(
        search.cv_results_["model_id"])
    assert sorted(search.model_history_.keys()) == list(range(20))
    assert set(search.model_history_[0][0].keys()) == {
        "model_id",
        "params",
        "partial_fit_calls",
        "partial_fit_time",
        "score",
        "score_time",
        "elapsed_wall_time",
    }

    X_, = yield c.compute([X])

    # Dask objects are lazy
    proba = search.predict_proba(X)
    log_proba = search.predict_log_proba(X)
    assert proba.shape == (1000, 2)
    assert log_proba.shape == (1000, 2)

    assert isinstance(proba, da.Array)
    assert isinstance(log_proba, da.Array)

    proba_ = search.predict_proba(X_)
    log_proba_ = search.predict_log_proba(X_)

    da.utils.assert_eq(proba, proba_)
    da.utils.assert_eq(log_proba, log_proba_)

    decision = search.decision_function(X_)
    assert decision.shape == (1000, )
Example #30
def run():
    # Imports assumed by this script; the original relied on module-level
    # imports that were not shown.
    from dask.distributed import Client
    import dask.dataframe as dd
    import numpy as np

    client = Client()
    df = dd.read_csv("isHealth.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    df = df.fillna(0)
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    y = df['acquired']
    X = df.drop('acquired', axis=1)
    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates],
               delimiter=",",
               fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))

    params = {'alpha': np.logspace(-2, 1, num=1000)}
    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est,
                                 params,
                                 n_initial_parameters=100,
                                 patience=20,
                                 max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    # Use the fitted search (rather than the earlier Incremental wrapper)
    # for the final predictions and score.
    np.savetxt("predictions.csv", search.predict_proba(X_test))
    print('Score:', search.score(X_test, y_test))
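
A minimal entry point, assuming the script above is meant to be run directly:

if __name__ == "__main__":
    run()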