Пример #1
0
def test_set_params():
    """Parameters set via ``set_params`` must round-trip through ``get_params``."""
    wrapper = Incremental(SGDClassifier())
    wrapper.set_params(scoring='accuracy', estimator__max_iter=20)
    params = wrapper.get_params()

    assert params['scoring'] == 'accuracy'
    assert params['estimator__max_iter'] == 20
Пример #2
0
def test_set_params():
    """``set_params`` updates wrapper-level and nested estimator parameters."""
    model = Incremental(SGDClassifier())
    new_params = {"scoring": "accuracy", "estimator__max_iter": 20}
    model.set_params(**new_params)

    fetched = model.get_params()
    assert fetched["scoring"] == "accuracy"
    assert fetched["estimator__max_iter"] == 20
Пример #3
0
def test_scoring(scheduler, xy_classification,
                 scoring=dask_ml.metrics.accuracy_score):
    """Passing a plain metric function (not a scorer) to ``scoring`` must raise."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        model = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        expected_msg = 'metric function rather than a scorer'
        with pytest.raises(ValueError, match=expected_msg):
            model.fit(X, y, classes=np.unique(y))
Пример #4
0
def test_fit_rechunking():
    """Fitting must work when the feature axis is split across several blocks."""
    n_classes = 2
    X, y = make_classification(chunks=20, n_classes=n_classes)
    X = X.rechunk({1: 10})

    # Sanity check: the columns really are in more than one block.
    assert X.numblocks[1] > 1

    model = Incremental(SGDClassifier(max_iter=5, tol=1e-3))
    model.fit(X, y, classes=list(range(n_classes)))
Пример #5
0
def test_incremental_basic(scheduler, xy_classification):
    """Incremental.fit must match block-wise partial_fit on a bare estimator."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        # est1 gets wrapped; est2 is an identical clone fit manually below.
        est1 = SGDClassifier(random_state=0, tol=1e-3)
        est2 = clone(est1)

        clf = Incremental(est1)
        result = clf.fit(X, y, classes=[0, 1])
        # Reproduce what Incremental does: partial_fit one chunk at a time.
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        # sklearn convention: fit returns self.
        assert result is clf

        assert isinstance(result.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(result.estimator.coef_,
                                             est2.coef_)

        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        # predict on a dask input stays lazy and returns a dask array.
        assert isinstance(result, da.Array)
        assert_eq(result, expected)

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        # assert isinstance(result, da.Array)
        assert_eq(result, expected)

        # partial_fit on a fresh wrapper must reach the same fitted state.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])
Пример #6
0
def test_fit_ndarrays():
    """``Incremental`` accepts in-memory NumPy arrays, not only dask collections."""
    X = np.ones((10, 5))
    y = np.repeat([0.0, 1.0], 5)

    base = SGDClassifier(tol=1e-3)
    wrapper = Incremental(base)

    wrapper.partial_fit(X, y, classes=[0, 1])
    base.fit(X, y)

    # The wrapped estimator object is the one that was passed in ...
    assert wrapper.estimator is base
    # ... and the wrapper exposes the fitted copy's coefficients.
    assert_eq(wrapper.coef_, wrapper.estimator_.coef_)
Пример #7
0
def test_estimator_param_raises():
    """A wrapped estimator that itself has an ``estimator`` param must be rejected."""

    class Dummy(sklearn.base.BaseEstimator):
        def __init__(self, estimator=42):
            self.estimator = estimator

        def fit(self, X):
            return self

    wrapped = Incremental(Dummy(estimator=1))

    # Both Incremental and Dummy claim the name 'estimator', making
    # get_params ambiguous.
    with pytest.raises(ValueError, match='used by both'):
        wrapped.get_params()
Пример #8
0
def test_incremental_basic(scheduler, dataframes):
    """Compare Incremental against manual chunk-wise partial_fit.

    Runs with either dask arrays or dask dataframes as input, depending on
    the ``dataframes`` parameter.
    """
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    # Labels: sign of a linear projection, mapped from {-1, 1} to {0, 1}.
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        # sklearn convention: fit returns self.
        assert result is clf

        # est2 is a sklearn optimizer; this is just a benchmark
        if dataframes:
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)

        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_].compute(),
                             y[slice_[0]].compute(),
                             classes=[0, 1])

        assert isinstance(result.estimator_.coef_, np.ndarray)
        # Coefficients need only be roughly similar; SGD is stochastic.
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        # The fitted wrapper exposes the same attribute surface as a
        # plain fitted estimator.
        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        if dataframes:
            # Compute is needed because chunk sizes of this array are unknown
            result = result.compute()
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.3

        # score
        result = clf.score(X, y)
        expected = est2.score(*dask.compute(X, y))
        assert abs(result - expected) < 0.1

        # partial_fit on a fresh wrapper should expose the same attributes.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Пример #9
0
def test_score(xy_classification):
    """Wrapper scoring must agree with the fitted estimator's own score."""
    distributed = pytest.importorskip("distributed")
    client = distributed.Client(n_workers=2)

    X, y = xy_classification
    model = Incremental(
        SGDClassifier(max_iter=1000, random_state=0), scoring="accuracy"
    )

    with client:
        model.fit(X, y, classes=[0, 1])
        wrapped_score = model.score(X, y)
        direct_score = model.estimator_.score(X, y)

    assert wrapped_score == direct_score
Пример #10
0
def test_in_gridsearch(scheduler, xy_classification):
    """``Incremental`` can be tuned inside scikit-learn's GridSearchCV.

    Bug fix: ``Incremental`` itself has no ``alpha`` parameter — the wrapped
    estimator's hyper-parameters must be addressed with the ``estimator__``
    prefix (the same convention ``set_params``/``get_params`` use elsewhere
    in this file), otherwise GridSearchCV raises on an unknown parameter.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        param_grid = {'estimator__alpha': [0.1, 10]}
        gs = sklearn.model_selection.GridSearchCV(clf, param_grid, iid=False)
        gs.fit(X, y, classes=[0, 1])
Пример #11
0
def test_same_models_with_random_state(c, s, a, b):
    """Two IncrementalSearchCV runs seeded identically must agree exactly.

    Asserts matching best score, best params, and best coefficients.
    """
    X, y = make_classification(n_samples=100,
                               n_features=2,
                               chunks=(10, 5),
                               random_state=0)
    model = Incremental(
        SGDClassifier(tol=-np.inf,
                      penalty="elasticnet",
                      random_state=42,
                      eta0=0.1))
    params = {
        "loss":
        ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    # Address the wrapped estimator's hyper-parameters through the wrapper.
    params = {"estimator__" + k: v for k, v in params.items()}
    search1 = IncrementalSearchCV(clone(model),
                                  params,
                                  n_initial_parameters=10,
                                  random_state=0)
    search2 = IncrementalSearchCV(clone(model),
                                  params,
                                  n_initial_parameters=10,
                                  random_state=0)

    # Coroutine-style distributed test: ``yield`` awaits the async fits.
    yield search1.fit(X, y, classes=[0, 1])
    yield search2.fit(X, y, classes=[0, 1])

    assert search1.best_score_ == search2.best_score_
    assert search1.best_params_ == search2.best_params_
    assert np.allclose(search1.best_estimator_.coef_,
                       search2.best_estimator_.coef_)
Пример #12
0
def test_incremental_text_pipeline(container):
    """End-to-end text pipeline: HashingVectorizer feeding Incremental."""
    raw = ["a list", "of words", "for classification"] * 100
    X = dd.from_pandas(pd.Series(raw), npartitions=3)

    if container == "bag":
        X = X.to_bag()

    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)

    # Partition sizes of X must line up with the chunking of y.
    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]

    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)

    pipe.fit(X, y, incremental__classes=[0, 1])
    transformed = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")

    # Chunk sizes are unknown after hashing text; materialize them first.
    transformed.compute_chunk_sizes()
    assert transformed.shape == (300, vect.n_features)

    predictions = pipe.predict(X).compute()
    assert len(predictions) == len(y)
Пример #13
0
def run_on_blobs():
    """Fit a streaming random forest over a large synthetic blob dataset."""
    x, y = dask_ml.datasets.make_blobs(n_samples=1e8,
                                       chunks=1e5,
                                       random_state=0,
                                       centers=3)

    x = dd.dataframe.from_array(x)
    y = dd.dataframe.from_array(y)

    print(f"Rows: {x.shape[0].compute()}")

    ests_per_chunk = 4
    chunks = len(x.divisions)

    model = Incremental(
        StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
                     max_n_estimators=np.inf,
                     verbose=1,
                     n_jobs=4))
    model.fit(x, y, classes=y.unique().compute())
Пример #14
0
def test_in_gridsearch(scheduler, xy_classification):
    """GridSearchCV over the wrapped estimator's ``alpha`` hyper-parameter."""
    X, y = xy_classification
    estimator = Incremental(SGDClassifier(random_state=0, tol=1e-3))
    grid = {"estimator__alpha": [0.1, 10]}
    # ``iid`` was removed in scikit-learn 0.22; only pass it on older versions.
    kwargs = {} if SK_022 else {"iid": False}
    search = sklearn.model_selection.GridSearchCV(estimator, grid, cv=3, **kwargs)

    with scheduler() as (s, [a, b]):
        search.fit(X, y, classes=[0, 1])
Пример #15
0
def test_scoring_string(scheduler, xy_classification, scoring):
    """A string scoring name must resolve to a callable scorer and work end-to-end."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        model = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        assert callable(check_scoring(model, scoring=scoring))
        model.fit(X, y, classes=np.unique(y))
        model.score(X, y)
Пример #16
0
def test_incremental_basic(scheduler):
    """Compare Incremental against manual chunk-wise partial_fit (array input)."""
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    # Labels: sign of a linear projection, mapped from {-1, 1} to {0, 1}.
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2

    with scheduler() as (s, [_, _]):
        # est1 gets wrapped; est2 is an identical clone fit manually below.
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        # Reproduce what Incremental does: partial_fit one chunk at a time.
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        # sklearn convention: fit returns self.
        assert result is clf

        assert isinstance(result.estimator_.coef_, np.ndarray)
        # Coefficients need only be roughly similar; SGD is stochastic.
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        # The fitted wrapper exposes the same attribute surface as a
        # plain fitted estimator.
        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.2

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        assert abs(result - expected) < 0.1

        # partial_fit on a fresh wrapper should expose the same attributes.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Пример #17
0
def test_score_ndarrays():
    """Scoring must work on both NumPy arrays and dask arrays."""
    X = np.ones((10, 5))
    y = np.ones(10)

    base = SGDClassifier(tol=1e-3)
    model = Incremental(base, scoring="accuracy")

    model.partial_fit(X, y, classes=[0, 1])
    model.fit(X, y, classes=[0, 1])

    # With constant features and labels the classifier is always right.
    assert model.score(X, y) == 1

    dX = da.from_array(X, chunks=(2, 5))
    dy = da.from_array(y, chunks=2)
    assert model.score(dX, dy) == 1
Пример #18
0
    def setUpClass(cls):
        """Set up model to test.

        Builds an Incremental-wrapped StreamingRFC (20 estimators per chunk,
        all cores, unbounded total estimators) and records the expected
        estimator count before deferring to the parent setup.
        """
        # NOTE(review): _prep_data appears to take and return the class
        # object itself -- confirm this pattern is intentional.
        cls = cls._prep_data(cls)
        cls.mod = Incremental(
            StreamingRFC(n_estimators_per_chunk=20,
                         n_jobs=-1,
                         max_n_estimators=np.inf,
                         verbose=1))

        # Set expected number of estimators
        cls.expected_n_estimators = 200

        # Set helper values
        super().setUpClass()
Пример #19
0
    def setUpClass(cls):
        """Set up model to test.

        Builds an Incremental-wrapped StreamingRFC (one estimator per chunk,
        capped at 39 total) and records the expected estimator count before
        deferring to the parent setup.
        """
        # NOTE(review): _prep_data appears to take and return the class
        # object itself -- confirm this pattern is intentional.
        cls = cls._prep_data(cls)
        cls.mod = Incremental(
            StreamingRFC(n_estimators_per_chunk=1,
                         max_n_estimators=39,
                         verbose=1))

        # Set expected number of estimators
        # This should be set manually depending on data.
        cls.expected_n_estimators = 10

        # Set helper values
        super().setUpClass()
Пример #20
0
def test_scoring_string(scheduler, xy_classification, scoring):
    """A string ``scoring`` must resolve to the matching dask-ml scorer.

    Bug fix: the original scorer comparison's result was discarded (a bare
    expression with no ``assert``), so a mismatch could never fail the test.
    Assert the resolution explicitly, using ``check_scoring`` the same way
    the other scoring tests in this file do.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(), scoring=scoring)
        if scoring:
            assert (dask_ml.metrics.scorer.SCORERS[scoring] ==
                    check_scoring(clf, scoring=scoring))
        clf.fit(X, y, classes=np.unique(y))
        clf.score(X, y)
        clf.estimator.score(X, y)
Пример #21
0
def run3():
    """Train an incremental MLP on pre-split train/test CSVs and dump outputs.

    Side effects: starts a dask distributed Client, reads
    ``isHealthTrain.csv`` / ``isHealthTest.csv``, and writes several CSV
    artifacts (tickers, dates, labels, predictions).
    """
    client = Client()
    from dask_ml.datasets import make_classification
    df = dd.read_csv("isHealthTrain.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    # NOTE(review): the second .fillna(0) looks redundant -- confirm and drop.
    df = df.fillna(0).fillna(0)
    # Drop columns containing '.' -- presumably duplicated columns that
    # pandas suffixed on read; verify against the CSV headers.
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_train = df['acquired']
    X_train = df.drop('acquired', axis=1)
    df2 = dd.read_csv("isHealthTest.csv",
                      assume_missing=True,
                      sample=640000000,
                      blocksize="10MB")
    df2 = df2.fillna(0).fillna(0)
    for column in df2.columns:
        if '.' in column:
            df2 = df2.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_test = df2['acquired']
    X_test = df2.drop('acquired', axis=1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    # Persist identifying columns before they are dropped from the features.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", x_test_tickers, delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    # Remove non-numeric identifier columns from both feature sets.
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    print(est)
    inc = Incremental(est, scoring='f1')
    print("WORKING")
    # Ten passes over the training data, saving predictions each epoch.
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))
Пример #22
0
    def setUpClass(cls):
        """Set up model to test.

        Regression variant: builds an Incremental-wrapped StreamingRFR with
        one estimator per chunk and all features, then records the expected
        estimator count before deferring to the parent setup.
        """
        # NOTE(review): _prep_data appears to take and return the class
        # object itself -- confirm this pattern is intentional.
        cls = cls._prep_data(cls, reg=True)
        cls.mod = Incremental(
            StreamingRFR(n_estimators_per_chunk=1,
                         n_jobs=-1,
                         max_n_estimators=np.inf,
                         max_features=cls.x.shape[1],
                         verbose=1))

        # Set expected number of estimators
        cls.expected_n_estimators = 10

        # Set helper values
        super().setUpClass()
Пример #23
0
def test_scoring_string(scheduler, xy_classification, scoring):
    """String scorer names must resolve through ``check_scoring`` to dask-ml scorers."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        model = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        if scoring:
            resolved = check_scoring(model, scoring=scoring)
            assert dask_ml.metrics.scorer.SCORERS[scoring] == resolved
        assert callable(check_scoring(model, scoring=scoring))
        model.fit(X, y, classes=np.unique(y))
        model.score(X, y)
Пример #24
0
def test_replace_scoring(estimator, fit_kwargs, scoring, xy_classification, mocker):
    """``Incremental.score`` must route through dask-ml's ``get_scorer``."""
    X, y = xy_classification
    model = Incremental(estimator(max_iter=1000, random_state=0, tol=1e-3))
    model.fit(X, y, **fit_kwargs)

    patched = mocker.patch.object(dask_ml.wrappers, "get_scorer")
    with patched:
        model.score(X, y)

    # Exactly one scorer lookup, requested with dask-aware compute.
    assert patched.call_count == 1
    patched.assert_called_with(scoring, compute=True)
Пример #25
0
def run():
    """Train an incremental MLP on one CSV with an in-code train/test split.

    Side effects: starts a dask distributed Client, reads ``isHealth.csv``,
    writes several CSV artifacts, then runs an IncrementalSearchCV over
    ``alpha``.
    """
    client = Client()
    from dask_ml.datasets import make_classification
    df = dd.read_csv("isHealth.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    # NOTE(review): the second .fillna(0) looks redundant -- confirm and drop.
    df = df.fillna(0).fillna(0)
    # Drop columns containing '.' -- presumably duplicated columns that
    # pandas suffixed on read; verify against the CSV headers.
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y = df['acquired']
    X = df.drop('acquired', axis=1)
    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    # Persist identifying columns before they are dropped from the features.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    # NOTE(review): unlike run3, this writes tickers AND dates into
    # x_test_tickers.csv -- confirm which layout downstream consumers expect.
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates],
               delimiter=",",
               fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    # Remove non-numeric identifier columns from both feature sets.
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    # Ten passes over the training data, saving predictions each epoch.
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))

    # model = MLPClassifier(solver='sgd', hidden_layer_sizes=(10,2),random_state=1)
    # Hyper-parameter search over alpha on the (unwrapped) estimator.
    params = {'alpha': np.logspace(-2, 1, num=1000)}
    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est,
                                 params,
                                 n_initial_parameters=100,
                                 patience=20,
                                 max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    # NOTE(review): these final saves reuse ``inc``, not the search's best
    # estimator -- confirm that is intended.
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
# Fit the logistic regression defined earlier (lr, X_train, y_train come
# from code above this chunk); .score returns a lazy dask result, hence
# the .compute() below.
with ProgressBar():
    lr.fit(X_train, y_train)

print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025

#####################################################################################

# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

nb = BernoulliNB()

# Wrap in Incremental so the NB model is fit block-wise over the dask data.
parallel_nb = Incremental(nb)

with ProgressBar():
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
##### OUTPUT --------> Naive Bayes Classifier Score :  0.65

######################################################################################

# Performing GridSearch on the Logistic Regression Classifier
# (the GridSearchCV call itself continues beyond this chunk)
from dask_ml.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}

lr = LogisticRegression()
Пример #27
0
def test_get_params():
    """``get_params`` exposes nested estimator params and wrapper defaults."""
    params = Incremental(SGDClassifier()).get_params()

    assert "estimator__max_iter" in params
    assert params["scoring"] is None
Пример #28
0
# Paths to saved feature arrays (each entry is loaded with np.load below).
features = FEATURES_ARRAY

# [OUTPUT_FOLDER + 'lbp' + FORMAT]: #
# for feature in features:
# Currently restricted to the single LBP feature file instead of the full list.
for feature in [OUTPUT_FOLDER + 'lbp' + FORMAT]:  # features:
    print("""
    ----------------------------------
    getting feature: {}
    """.format(feature))
    X = np.load(feature, allow_pickle=True)
    X = to_arr(X)
    np.save('lbp_arr', X)
    # Single-chunk dask array wrapping the whole in-memory matrix.
    X = da.from_array(X, chunks=X.shape)

    X = transformer_pipe.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    classes = da.unique(y_train).compute()

    "One model for all"
    # NOTE(review): sklearn's SVC has no partial_fit, which Incremental
    # relies on -- confirm this loop actually runs as intended.
    inc = Incremental(SVC(random_state=RANDOM_STATE), scoring='accuracy')
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=classes)
        print('Score:', inc.score(X_test, y_test))

    score = inc.score(X_test, y_test)
    print(score)

    np.save('lbp_svm', score)

time_it()
Пример #29
0
    def _test_basic(c, s, a, b):
        """Exercise HyperbandSearchCV end-to-end on a recoverable linear problem.

        Parametrized (from enclosing scope) by ``array_type``, ``library``,
        and ``max_iter``; checks scoring consistency, model/partial_fit call
        counts, and bracket bookkeeping.
        """
        rng = da.random.RandomState(42)

        n, d = (50, 2)
        # create observations we know linear models can fit
        X = rng.normal(size=(n, d), chunks=n // 2)
        coef_star = rng.uniform(size=d, chunks=d)
        y = da.sign(X.dot(coef_star))

        if array_type == "numpy":
            # Materialize on the client so the search sees in-memory arrays.
            X, y = yield c.compute((X, y))

        params = {
            "loss":
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            "average": [True, False],
            "learning_rate": ["constant", "invscaling", "optimal"],
            "eta0":
            np.logspace(-2, 0, num=1000),
        }
        model = SGDClassifier(tol=-np.inf,
                              penalty="elasticnet",
                              random_state=42,
                              eta0=0.1)
        if library == "dask-ml":
            # Wrapped estimator: address its params via the estimator__ prefix.
            model = Incremental(model)
            params = {"estimator__" + k: v for k, v in params.items()}
        elif library == "ConstantFunction":
            model = ConstantFunction()
            params = {"value": np.linspace(0, 1, num=1000)}

        search = HyperbandSearchCV(model,
                                   params,
                                   max_iter=max_iter,
                                   random_state=42)
        classes = c.compute(da.unique(y))
        yield search.fit(X, y, classes=classes)

        if library == "dask-ml":
            X, y = yield c.compute((X, y))
        score = search.best_estimator_.score(X, y)
        assert score == search.score(X, y)
        assert 0 <= score <= 1

        if library == "ConstantFunction":
            assert score == search.best_score_
        else:
            # These are not equal because IncrementalSearchCV uses a train/test
            # split and we're testing on the entire train dataset, not only the
            # validation/test set.
            assert abs(score - search.best_score_) < 0.1

        assert type(search.best_estimator_) == type(model)
        assert isinstance(search.best_params_, dict)

        num_fit_models = len(set(search.cv_results_["model_id"]))
        num_pf_calls = sum([
            v[-1]["partial_fit_calls"] for v in search.model_history_.values()
        ])
        # Expected counts keyed by max_iter, fixed by random_state=42.
        models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
        pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
        assert num_fit_models == models[max_iter]
        assert num_pf_calls == pf_calls[max_iter]

        best_idx = search.best_index_
        if isinstance(model, ConstantFunction):
            assert search.cv_results_["test_score"][best_idx] == max(
                search.cv_results_["test_score"])
        model_ids = {h["model_id"] for h in search.history_}

        if math.log(max_iter, 3) % 1.0 == 0:
            # log(max_iter, 3) % 1.0 == 0 is the good case when max_iter is a
            # power of search.aggressiveness
            # In this case, assert that more models are tried then the max_iter
            assert len(model_ids) > max_iter
        else:
            # Otherwise, give some padding "almost as many estimators are tried
            # as max_iter". 3 is a fudge number chosen to be the minimum; when
            # max_iter=20, len(model_ids) == 17.
            assert len(model_ids) + 3 >= max_iter

        assert all("bracket" in id_ for id_ in model_ids)
0
#Create one single learner instance to be used throughout this code block instead of refitting everytime
mlpPD = MLPClassifier(hidden_layer_sizes=(5, 5),
                      max_iter=300,
                      activation='relu',
                      solver='adam',
                      learning_rate_init=0.001,
                      beta_1=0.5,
                      alpha=0.01,
                      shuffle=True)
#rfPD = BernoulliNB(alpha=0.5, binarize=0.0, fit_prior=True, class_prior=None)
#rfPD = RandomForestClassifier(n_estimators=100,random_state=RSEED,max_features='sqrt',
#                             n_jobs=-1, warm_start=True)

##Wrap learner in Incremental. Use this from now on as model. Will help with batching
learnermlpPD = Incremental(mlpPD)
#learnerrfPD = Incremental(rfPD)

#Need to get encoded fit for the first set of data and apply to all other months.
#Refitting each time causes errors
df = pd.read_csv(filenames[0])
#df['FL_DATE'] = df['FL_DATE'].astype(str)
df['OP_UNIQUE_CARRIER'] = df['OP_UNIQUE_CARRIER'].astype(str)
df['ORIGIN'] = df['ORIGIN'].astype(str)
#df['DEP_TIME'] = df['DEP_TIME'].astype(str)
x_PD = df[['FL_DATE', 'DEP_TIME', 'OP_UNIQUE_CARRIER', 'ORIGIN']]
#'DATE_INT','DEP_HOUR'
y_PD = df['DEP_DELAY_IND']
x_trainPD, x_testPD, y_trainPD, y_testPD = train_test_split(x_PD,
                                                            y_PD,
                                                            test_size=0.10,