def objective(trial):
    """Optuna objective: score a dask-ml LogisticRegression on the iris data.

    Samples a solver, a regularization strength ``C`` and (where relevant) a
    penalty, fits on a train split, and returns the validation score.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    # Rechunk the arrays into five dask partitions each.
    X = da.from_array(X, chunks=len(X) // 5)
    y = da.from_array(y, chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        "solver", ["admm", "gradient_descent", "proximal_grad"]
    )
    C = trial.suggest_float("C", 0.0, 1.0)
    if solver in ("admm", "proximal_grad"):
        penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elastic_net"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"

    classifier = LogisticRegression(
        max_iter=200, solver=solver, C=C, penalty=penalty
    )
    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    return classifier.score(X_valid, y_valid)
def objective(trial):
    """Optuna objective: score a dask-ml LogisticRegression on the iris data.

    Fix: ``trial.suggest_uniform`` was deprecated in Optuna 3.0 and later
    removed; ``trial.suggest_float`` is the drop-in replacement with the same
    semantics for a uniform range.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    # Rechunk into five dask partitions each.
    X, y = da.from_array(X, chunks=len(X) // 5), da.from_array(y, chunks=len(y) // 5)
    solver = trial.suggest_categorical(
        'solver', ['admm', 'gradient_descent', 'proximal_grad'])
    # suggest_float replaces the deprecated suggest_uniform.
    C = trial.suggest_float('C', 0.0, 1.0)
    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elastic_net'])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = 'l2'
    classifier = LogisticRegression(max_iter=200, solver=solver, C=C,
                                    penalty=penalty)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    return score
def test_big(fit_intercept):
    """Smoke-test the full prediction API on a chunked classification problem."""
    X, y = make_classification(chunks=50)
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(X, y)
    # Each of these should run without raising.
    model.decision_function(X)
    model.predict(X)
    model.predict_proba(X)
    if fit_intercept:
        assert model.intercept_ is not None
def test_fit_solver(solver):
    """Fit with the given solver, skipping on old dask-glm releases.

    Fix: ``distutils.version.LooseVersion`` relies on distutils, which is
    deprecated (PEP 632) and removed in Python 3.12. Use
    ``packaging.version.parse`` instead — this also matches the sibling
    version of this test that already uses ``packaging``.
    """
    import dask_glm
    import packaging.version

    if packaging.version.parse(
            dask_glm.__version__) <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
def test_fit_solver(solver):
    """Fit with the given solver on dask-glm releases newer than 0.2.0."""
    import dask_glm
    from packaging.version import parse as parse_version

    # Old dask-glm emits a FutureWarning for dask config; skip those releases.
    if parse_version(dask_glm.__version__) <= parse_version("0.2.0"):
        pytest.skip("FutureWarning for dask config.")
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    clf = LogisticRegression(solver=solver)
    clf.fit(X, y)
def test_dataframe_warns_about_chunks(fit_intercept):
    """Fitting on dask dataframes raises a TypeError pointing at the fix."""
    rng = np.random.RandomState(42)
    n_rows, n_cols = 20, 5
    X = dd.from_pandas(
        pd.DataFrame(rng.uniform(size=(n_rows, n_cols))), npartitions=4
    )
    y = dd.from_pandas(pd.Series(rng.choice(2, size=n_rows)), npartitions=4)
    clf = LogisticRegression(fit_intercept=fit_intercept)
    msg = "does not support dask dataframes.*might be resolved with"
    with pytest.raises(TypeError, match=msg):
        clf.fit(X, y)
    # The array forms are all accepted.
    clf.fit(X.values, y.values)
    clf.fit(X.to_dask_array(), y.to_dask_array())
    clf.fit(X.to_dask_array(lengths=True), y.to_dask_array(lengths=True))
def train(X_train, y_train, out_model):
    """Fit a logistic regression, pickle it to ``out_model``, and print a
    confusion matrix computed on the training data.

    Fix: the original wrote ``pickle.dump(lr, open(out_model, "wb"))``, which
    never closes the file handle — the pickle may not be flushed to disk if an
    error occurs. A ``with`` block guarantees close/flush.
    """
    lr = LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=64, max_iter=10)
    # If leave just the dataframe, will throw an error saying "This estimator
    # does not support dask dataframes."
    lr.fit(X_train.values, y_train.values)
    # Saving model for later prediction
    with open(out_model, "wb") as fh:
        pickle.dump(lr, fh)
    # Outputing some statistics
    y_train_pred = lr.predict(X_train.values)
    TN, FP, FN, TP = confusion_matrix_dask(y_train.values, y_train_pred)
    print("Read like \n[[TN, FP], \n[FN, TP]]\n",
          np.array([[TN, FP], [FN, TP]]))
def simple_example():
    """Fit a dask-ml LogisticRegression on synthetic data and print results."""
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)
    frame = dd.from_dask_array(X, columns=["a", "b"])
    target = dd.from_array(y)
    model = LogisticRegression()
    # The estimator wants dask arrays, so fit/predict on the .values views.
    model.fit(frame.values, target.values)
    print('Predictions =', model.predict(frame.values).compute())
    print('Probabilities =', model.predict_proba(frame.values).compute())
    print('Scores =', model.score(frame.values, target.values).compute())
def test_fit(fit_intercept, solver):
    """Fit and predict run cleanly on chunked synthetic data."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    clf = LogisticRegression(fit_intercept=fit_intercept)
    clf.fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
def test_lr_init(solver):
    """Constructing the estimator with any parametrized solver must not raise."""
    LogisticRegression(solver=solver)
def test_logistic_predict_proba_shape():
    """predict_proba yields one column per class (binary problem -> 2)."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression()
    model.fit(X, y)
    probabilities = model.predict_proba(X)
    assert probabilities.shape == (100, 2)
def test_gridsearch():
    """GridSearchCV over C works with the estimator wrapped in a pipeline."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    param_grid = {"logisticregression__C": [1000, 100, 10, 2]}
    pipeline = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = GridSearchCV(pipeline, param_grid, cv=3)
    search.fit(X, y)
# ('ohe_3', OneHotEncoder(sparse=False), [2]), # ('ohe_4', OneHotEncoder(sparse=False), [1]), # ('ohe_5', OneHotEncoder(sparse=False), [1]), # ('ohe_6', OneHotEncoder(sparse=False), [1]), # ('ohe_7', OneHotEncoder(sparse=False), [1]), # ('ohe_8', OneHotEncoder(sparse=False), [1]), # ('ohe_9', OneHotEncoder(sparse=False), [1]), ('hod_0', HourOfDayFromDatetimeString(), [0]), ('hod_1', HourOfDayFromDatetimeString(), [0]), ('hod_2', HourOfDayFromDatetimeString(), [0]), ('hod_3', HourOfDayFromDatetimeString(), [0]), ('hod_4', HourOfDayFromDatetimeString(), [0]), ('hod_5', HourOfDayFromDatetimeString(), [0]), ('hod_6', HourOfDayFromDatetimeString(), [0]), ('hod_7', HourOfDayFromDatetimeString(), [0]), ('hod_8', HourOfDayFromDatetimeString(), [0]), ('hod_9', HourOfDayFromDatetimeString(), [0])]) ), ('clf', LogisticRegression()) ]) pipe.fit(X, y) pipe_runtime = pipe.to_runtime() print_speed_comparison(X, pipe, pipe_runtime) # Save for model serving init('./model_repo', pipe_runtime, df[FEATURES].head()) client.close()
import dask.dataframe as dd
import dask.datasets as ds
import time

from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

# --- Local dask-ml run -------------------------------------------------------
X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
# est.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# --- Parallelize Scikit-Learn Directly ---------------------------------------
from dask.distributed import Client
# Fix: ``sklearn.externals.joblib`` was removed in scikit-learn 0.23;
# import parallel_backend from joblib itself instead.
from joblib import parallel_backend

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)

with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    # est.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
def test_fit_solver(solver):
    """Every parametrized solver can fit the model without error."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression(solver=solver)
    model.fit(X, y)
return da.concatenate(chunks, axis=0) # Test-train split from dask_ml.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X), to_dask_array(y), random_state=99) ################################################################################### # Fitting the Logistic Regression Classifier from dask_ml.linear_model import LogisticRegression lr = LogisticRegression() with ProgressBar(): lr.fit(X_train, y_train) print('Logistic Regression Score : ', lr.score(X_test, y_test).compute()) ##### OUTPUT --------> Logistic Regression Score : 0.70025 ##################################################################################### # Fitting the Naive Bayes Classifier from sklearn.naive_bayes import BernoulliNB from dask_ml.wrappers import Incremental nb = BernoulliNB()
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()

    # Load the heights/weights dataset and derive a binary gender target.
    data_dir = Path('./data')
    df = pd.read_csv(data_dir / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Scale then classify; grid-search the regularization strength C.
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    param_grid = {'logisticregression__C': [.1, 1, 10, 100]}
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))
def __init__(self, api, policy='lookaround', model=None, broker_commission=0.003):
    """Initialize the strategy wrapper.

    :param api: trading API handle (stored as-is).
    :param policy: decision policy name; defaults to 'lookaround'.
    :param model: classifier to use. If ``None``, a fresh
        ``LogisticRegression(solver='admm')`` is created per instance.
    :param broker_commission: commission rate applied to trades.

    Fix: the original default was ``model=LogisticRegression(solver='admm')``
    — a mutable default evaluated once at ``def`` time, so a single estimator
    instance would be shared (and mutated by ``fit``) across every object
    constructed with the default. ``None`` acts as a sentinel instead.
    """
    self.api = api
    self.broker_commission = broker_commission
    # Build a fresh estimator per instance when none is supplied.
    self.model = LogisticRegression(solver='admm') if model is None else model
    self.policy = policy
def test_in_pipeline():
    """The estimator composes with sklearn-style pipelines."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    pipeline = make_pipeline(DoNothingTransformer(), LogisticRegression())
    pipeline.fit(X, y)