Example #1
def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X,
                         chunks=len(X) // 5), da.from_array(y,
                                                            chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        "solver", ["admm", "gradient_descent", "proximal_grad"])
    C = trial.suggest_float("C", 0.0, 1.0)

    if solver == "admm" or solver == "proximal_grad":
        penalty = trial.suggest_categorical("penalty",
                                            ["l1", "l2", "elastic_net"])
    else:
        # The 'penalty' parameter isn't relevant for this solver,
        # so we just pass 'l2' as a placeholder.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_valid, y_valid)
    return score
Example #2
def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X,
                         chunks=len(X) // 5), da.from_array(y,
                                                            chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        'solver', ['admm', 'gradient_descent', 'proximal_grad'])
    C = trial.suggest_uniform('C', 0.0, 1.0)

    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty',
                                            ['l1', 'l2', 'elastic_net'])
    else:
        # The 'penalty' parameter isn't relevant for this solver,
        # so we just pass 'l2' as a placeholder.
        penalty = 'l2'

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_test, y_test)
    return score
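Both objectives above are written to be driven by an Optuna study. A minimal sketch of the calling code (the study direction and trial count are assumptions, not part of the original snippets):

import optuna

study = optuna.create_study(direction="maximize")  # the objective returns accuracy
study.optimize(objective, n_trials=20)             # trial budget is illustrative
print(study.best_params)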
Example #3
def test_big(fit_intercept):
    X, y = make_classification(chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.decision_function(X)
    lr.predict(X)
    lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None
Example #4
def test_fit_solver(solver):
    import dask_glm
    from distutils.version import LooseVersion

    if LooseVersion(dask_glm.__version__) <= "0.2.0":
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Example #5
def test_fit_solver(solver):
    import dask_glm
    import packaging.version

    if packaging.version.parse(
            dask_glm.__version__) <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
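The test functions in Examples #3-#5 receive solver and fit_intercept through pytest. A plausible parametrization, assuming the solver names dask-glm ships (not shown in the original snippets):

import pytest

@pytest.fixture(params=["admm", "gradient_descent", "newton",
                        "lbfgs", "proximal_grad"])
def solver(request):
    # Each test that takes `solver` runs once per dask-glm solver name.
    return request.param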
Example #6
def test_dataframe_warns_about_chunks(fit_intercept):
    rng = np.random.RandomState(42)
    n, d = 20, 5
    kwargs = dict(npartitions=4)
    X = dd.from_pandas(pd.DataFrame(rng.uniform(size=(n, d))), **kwargs)
    y = dd.from_pandas(pd.Series(rng.choice(2, size=n)), **kwargs)
    clf = LogisticRegression(fit_intercept=fit_intercept)
    msg = "does not support dask dataframes.*might be resolved with"
    with pytest.raises(TypeError, match=msg):
        clf.fit(X, y)
    clf.fit(X.values, y.values)
    clf.fit(X.to_dask_array(), y.to_dask_array())
    clf.fit(X.to_dask_array(lengths=True), y.to_dask_array(lengths=True))
Example #7
def train(X_train, y_train, out_model):
    lr = LogisticRegression(penalty='l2',
                            solver='lbfgs',
                            n_jobs=64,
                            max_iter=10)
    # Passing the dask dataframe directly raises "This estimator
    # does not support dask dataframes.", so we pass .values instead.
    lr.fit(X_train.values, y_train.values)

    # Saving model for later prediction
    pickle.dump(lr, open(out_model, "wb"))

    # Outputting some statistics
    y_train_pred = lr.predict(X_train.values)

    TN, FP, FN, TP = confusion_matrix_dask(y_train.values, y_train_pred)
    print("Read like \n[[TN, FP], \n[FN, TP]]\n", np.array([[TN, FP], [FN,
                                                                       TP]]))
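The pickled model can later be reloaded for prediction. A minimal sketch; X_new is a hypothetical dataframe of new rows, not part of the original snippet:

import pickle

with open(out_model, "rb") as f:   # path written by train() above
    lr = pickle.load(f)
y_pred = lr.predict(X_new.values)  # X_new: hypothetical new feature dataframe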
Example #8
def simple_example():
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)

    X = dd.from_dask_array(X, columns=["a", "b"])
    y = dd.from_array(y)

    lr = LogisticRegression()
    lr.fit(X.values, y.values)

    print('Predictions =', lr.predict(X.values).compute())
    print('Probabilities =', lr.predict_proba(X.values).compute())
    print('Scores =', lr.score(X.values, y.values).compute())
Example #9
def test_fit(fit_intercept, solver):
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
Example #10
def test_lr_init(solver):
    LogisticRegression(solver=solver)
Example #11
def test_logistic_predict_proba_shape():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression()
    lr.fit(X, y)
    prob = lr.predict_proba(X)
    assert prob.shape == (100, 2)
Example #12
def test_gridsearch():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    grid = {"logisticregression__C": [1000, 100, 10, 2]}
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = GridSearchCV(pipe, grid, cv=3)
    search.fit(X, y)
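After fitting, dask-ml's GridSearchCV exposes the usual scikit-learn result attributes, e.g.:

print(search.best_params_)  # the best C from the grid above
print(search.best_score_)   # its mean cross-validated score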
Example #13
            # ('ohe_3', OneHotEncoder(sparse=False), [2]),
            # ('ohe_4', OneHotEncoder(sparse=False), [1]),
            # ('ohe_5', OneHotEncoder(sparse=False), [1]),
            # ('ohe_6', OneHotEncoder(sparse=False), [1]),
            # ('ohe_7', OneHotEncoder(sparse=False), [1]),
            # ('ohe_8', OneHotEncoder(sparse=False), [1]),
            # ('ohe_9', OneHotEncoder(sparse=False), [1]),
            ('hod_0', HourOfDayFromDatetimeString(), [0]),
            ('hod_1', HourOfDayFromDatetimeString(), [0]),
            ('hod_2', HourOfDayFromDatetimeString(), [0]),
            ('hod_3', HourOfDayFromDatetimeString(), [0]),
            ('hod_4', HourOfDayFromDatetimeString(), [0]),
            ('hod_5', HourOfDayFromDatetimeString(), [0]),
            ('hod_6', HourOfDayFromDatetimeString(), [0]),
            ('hod_7', HourOfDayFromDatetimeString(), [0]),
            ('hod_8', HourOfDayFromDatetimeString(), [0]),
            ('hod_9', HourOfDayFromDatetimeString(), [0])])
        ),
        ('clf', LogisticRegression())
    ])

    pipe.fit(X, y)
    pipe_runtime = pipe.to_runtime()

    print_speed_comparison(X, pipe, pipe_runtime)

    # Save for model serving
    init('./model_repo', pipe_runtime, df[FEATURES].head())

    client.close()
Example #14
import dask.dataframe as dd
import dask.datasets as ds
import time
from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
# lr.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# Parallelize Scikit-Learn Directly
from dask.distributed import Client
from joblib import parallel_backend  # sklearn.externals.joblib was removed from scikit-learn

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)
with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    # lr.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
Example #15
def test_fit_solver(solver):
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Example #16
def to_dask_array(X):
    # NOTE: the body of this helper was truncated in the original snippet;
    # it builds the list of per-block dask arrays `chunks` before this line.
    return da.concatenate(chunks, axis=0)


# Test-train split
from dask_ml.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X),
                                                    to_dask_array(y),
                                                    random_state=99)

###################################################################################

# Fitting the Logistic Regression Classifier
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()

with ProgressBar():
    lr.fit(X_train, y_train)

print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025

#####################################################################################

# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

nb = BernoulliNB()
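The snippet stops right after constructing the estimator. A minimal sketch of the likely continuation with the Incremental wrapper (the classes list assumes the binary labels used above):

inc = Incremental(nb)
inc.fit(X_train, y_train, classes=[0, 1])  # trains via partial_fit over blocks
y_pred = inc.predict(X_test)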
Example #17
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()
    data = Path('./data')
    df = pd.read_csv(data / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    grid = GridSearchCV(pipeline,
                        param_grid={'logisticregression__C': [.1, 1, 10, 100]},
                        cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))
Example #18
def __init__(self, api, policy='lookaround',
             model=LogisticRegression(solver='admm'),
             broker_commission=0.003):
    self.api = api
    self.broker_commission = broker_commission
    self.model = model
    self.policy = policy
Example #19
def test_in_pipeline():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    pipe.fit(X, y)
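The fitted pipeline supports the usual prediction path, which the test could additionally exercise, for example by appending:

    y_pred = pipe.predict(X)            # lazy dask array; DoNothingTransformer passes X through
    assert y_pred.compute().shape == (100,)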