def test_unknown_chunks_ok(fit_intercept):
    """Smoke-test that ``fit`` accepts arrays taken from dask dataframes.

    Regression test for https://github.com/dask/dask-ml/issues/145

    The inputs are built via ``dd.from_pandas(...).values``; the test only
    checks that fitting completes without raising.
    """
    feature_frame = pd.DataFrame(np.random.uniform(size=(10, 5)))
    target_series = pd.Series(np.random.uniform(size=(10,)))

    X = dd.from_pandas(feature_frame, 2).values
    y = dd.from_pandas(target_series, 2).values

    LinearRegression(fit_intercept=fit_intercept).fit(X, y)
def test_lm(fit_intercept):
    """Fit and predict on a small synthetic regression problem.

    When ``fit_intercept`` is truthy the fitted estimator must expose a
    non-``None`` ``intercept_``.
    """
    X, y = make_regression(n_samples=100, n_features=5, chunks=50)

    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(X, y)
    model.predict(X)  # only checked for not raising

    if not fit_intercept:
        return
    assert model.intercept_ is not None
hour_no_train_X.head(1)


# In[202]:

# Target column for the training split.
hour_no_train_label = hour_no_train.loc[:, "cnt"]
hour_no_train_label.head()


# Now we build the most basic model with Linear Regression

# In[203]:

LR = LinearRegression(fit_intercept=True)


# In[204]:

# NOTE(review): the result is stored as an attribute on `da`; confirm that
# this is intentional and not an artefact of an automated rewrite.
baseline_features = hour_no_train_X.values
baseline_target = hour_no_train_label.values
da.LR_model_baseline = LR.fit(baseline_features, baseline_target)


# In[205]:

# Label-based column slice from "season" through "windspeed".
da.hour_no_test_X = hour_no_test.loc[:, "season":"windspeed"]


# In[206]:
import datetime as dt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from dask_ml.linear_model import LinearRegression

# NOTE: this import deliberately comes after the scikit-learn one above and
# rebinds `r2_score` to the dask-ml implementation, as in the original.
from dask_ml.metrics import r2_score

# linear regression
lr_model = LinearRegression()

# Chronological split: rows before the cut-off are used for training, the
# rest for testing. The masks are built lazily from the dask column instead
# of materialising the whole column with `.compute()` for each comparison.
SPLIT_DATE = "2012-10-01"
datetime_col = hour_df["datetime"]
traintrial = hour_df[datetime_col < SPLIT_DATE]
testtrial = hour_df[datetime_col >= SPLIT_DATE]

# Columns removed from the feature matrices: the target ("cnt") plus
# "registered", "month", "casual", "season" and "datetime".
# The SAME list is applied to both splits. Previously "season" was dropped
# from the training features only, so the test matrix had one more column
# than the matrix the model is fitted on.
DROPPED_COLUMNS = ["cnt", "registered", "month", "casual", "season", "datetime"]

X_traintrial = traintrial.drop(DROPPED_COLUMNS, axis=1)
y_traintrial = traintrial["cnt"]

X_testtrial = testtrial.drop(DROPPED_COLUMNS, axis=1)
y_testtrial = testtrial["cnt"]
def test_lr_score():
    """A model fitted to predict its own input should score approximately 1.

    The same single-column array is used as both features and target, and
    the resulting score is compared to 1 with a relative tolerance of 0.001.
    """
    column = np.arange(1000).reshape(1000, 1)
    X = da.from_array(column)

    model = LinearRegression()
    model.fit(X, X)

    fit_score = model.score(X, X)
    assert fit_score == pytest.approx(1, 0.001)
from dask_ml.linear_model import LinearRegression

# Categorical feature columns shared by the training and the test frames.
CATEGORICAL_COLUMNS = [
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
]

# Defining the data and target
categorical_variables = df[CATEGORICAL_COLUMNS]
target = df['Purchase']

# Creating dummies for the categorical variables (result is computed eagerly)
categorized_train = categorical_variables.categorize()
data = dd.get_dummies(categorized_train).compute()

# Converting dataframe to array
datanew = data.values

# Fit the model
lr = LinearRegression()
lr.fit(datanew, target)

# Preparing the test data the same way as the training data
test_categorical = test[CATEGORICAL_COLUMNS]
categorized_test = test_categorical.categorize()
test_dummy = dd.get_dummies(categorized_test).compute()
testnew = test_dummy.values

# Predict on test and upload
pred = lr.predict(testnew)

# Clustering/K-Means
from dask_ml.cluster import KMeans
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Estimate the sky from visibilities with an L2-penalised linear fit.

    Two code paths are available:

    * ``usedask=False``: build the operator with ``self.make_gamma(sphere)``
      and solve with ``linear_model.Ridge`` (``solver="lsqr"``).
    * ``usedask=True``: build the operator as dask arrays on a local
      ``dask.distributed`` cluster and solve with dask-ml's
      ``LinearRegression`` using a ``dask_glm`` L2 regularizer.

    Args:
        vis_arr: Complex visibilities. On the dask path the real and
            imaginary parts are concatenated into one float32 vector.
        sphere: Object providing ``pixels``, ``l``, ``m``, ``n``, ``npix``
            and ``set_visible_pixels``.
        alpha: Regularisation strength. Used directly by the Ridge path and
            divided by ``sqrt(n_s)`` on the dask path.
        scale: Accepted for interface compatibility but not used here;
            ``set_visible_pixels`` is always called with ``scale=False``.
        usedask: Select the dask code path when true.

    Returns:
        The solved sky reshaped to a column, ``sky.reshape(-1, 1)``.

    Side effects:
        Calls ``sphere.set_visible_pixels``. The dask path also starts a
        ``LocalCluster``/``Client`` that is left running and sets the global
        dask config key ``array.chunk-size``.
    """
    n_s = sphere.pixels.shape[0]
    lambduh = alpha / np.sqrt(n_s)

    if not usedask:
        gamma = self.make_gamma(sphere)
        logger.info("augmented: {}".format(gamma.shape))

        vis_aux = vis_to_real(vis_arr)
        logger.info("vis mean: {} shape: {}".format(
            np.mean(vis_aux), vis_aux.shape))

        tol = min(alpha / 1e4, 1e-10)
        logger.info("Solving tol={} ...".format(tol))

        # Calling scipy.sparse.linalg.lsqr(gamma, vis_aux, damp=alpha)
        # directly was an alternative that used to live in an unreachable
        # branch here; Ridge with the lsqr solver is the path in use.
        reg = linear_model.Ridge(alpha=alpha,
                                 tol=tol,
                                 solver="lsqr",
                                 max_iter=100000)
        reg.fit(gamma, vis_aux)
        logger.info("    Solve Complete, iter={}".format(reg.n_iter_))

        sky = da.from_array(reg.coef_)
        residual = vis_aux - gamma @ sky

        # Evaluate the solution and both squared norms in one compute call.
        sky, residual_norm, solution_norm = da.compute(
            sky, np.linalg.norm(residual)**2, np.linalg.norm(sky)**2)

        score = reg.score(gamma, vis_aux)
        logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
            alpha, score, residual_norm, solution_norm))
    else:
        from dask_ml.linear_model import LinearRegression
        import dask_glm
        from dask.distributed import Client, LocalCluster
        import dask

        logger.info("Starting Dask Client")
        # An already running scheduler could be used instead via
        # Client("tcp://localhost:8786").
        cluster = LocalCluster(dashboard_address=":8231", processes=False)
        client = Client(cluster)
        logger.info("Client = {}".format(client))

        p2j = 2 * np.pi * 1.0j
        dl = sphere.l
        dm = sphere.m
        n_arr_minus_1 = sphere.n - 1
        norm = np.sqrt(sphere.npix)  # loop-invariant normalisation

        # One harmonic (operator row) per (u, v, w) sample. The persisted
        # array is the one that is kept; previously the persisted result was
        # bound to a misspelled name and thrown away.
        harmonic_list = [
            client.persist(
                da.from_array(
                    np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) / norm,
                    chunks=(n_s, ),
                ))
            for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr)
        ]

        gamma = da.stack(harmonic_list)
        logger.info("Gamma Shape: {}".format(gamma.shape))
        gamma = gamma.conj()
        gamma = client.persist(gamma)
        logger.info("Gamma Shape: {}".format(gamma.shape))

        logger.info("Building Augmented Operator...")
        # Stack real over imaginary parts so the problem is purely real.
        proj_operator = da.block([[da.real(gamma)], [da.imag(gamma)]])
        proj_operator = client.persist(proj_operator)
        logger.info("Proj Operator shape {}".format(proj_operator.shape))

        vis_aux = da.from_array(
            np.array(
                np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                dtype=np.float32,
            ))

        en = dask_glm.regularizers.L2()

        dask.config.set({"array.chunk-size": "1024MiB"})
        # Every chunk spans all n_s columns; the row chunking is automatic.
        A = da.rechunk(proj_operator, chunks=("auto", n_s))
        A = client.persist(A)
        y = client.persist(vis_aux)
        logger.info("Rechunking completed.. A= {}.".format(A.shape))

        reg = LinearRegression(
            penalty=en,
            C=1.0 / lambduh,
            fit_intercept=False,
            solver="lbfgs",
            max_iter=1000,
            tol=1e-8,
        )
        reg.fit(A, y)
        sky = reg.coef_

        score = reg.score(proj_operator, vis_aux)
        logger.info("Loss function: {}".format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    sphere.set_visible_pixels(sky, scale=False)
    return sky.reshape(-1, 1)
if __name__ == '__main__':
    # Command-line entry point: fetch the NYC flights data, build the
    # train/test/validation splits, fit a linear regression and print the
    # test and validation scores.
    parser = argparse.ArgumentParser(
        description='Example of predicting departure delay from NYC flights')
    parser.add_argument('--data_dir',
                        default='data',
                        dest='data_dir',
                        type=str)
    parser.add_argument(
        '--nyc_url',
        default='https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz',
        dest='nyc_url',
        type=str)
    args = parser.parse_args()

    client = Client(n_workers=2)

    load_flights(args.data_dir, args.nyc_url)
    # Read from the directory the data was just loaded into. The original
    # referenced a bare `data_dir`, which is not defined in this block and
    # bypasses the --data_dir option.
    X, y = get_full_data(args.data_dir)
    X_train, X_test, X_valid, y_train, y_test, y_valid = prepare_dataset(X, y)

    lr = LinearRegression()
    test_score, valid_score = train(lr, X_train, X_test, X_valid, y_train,
                                    y_test, y_valid)
    print(test_score, valid_score)
import sys
import time

import dask.dataframe as dd
from dask.distributed import Client
from dask_ml.linear_model import LinearRegression

# Script: fit a linear regression (no intercept) on a header-less CSV whose
# path is given as the first command-line argument, then print the fitted
# coefficients and the elapsed wall-clock time.

TARGET_COLUMN = 3
# Columns 0..21 except the target (3) and column 19.
FEATURE_COLUMNS = [
    col for col in range(22) if col not in (TARGET_COLUMN, 19)
]

client = Client(n_workers=4)

t0 = time.time()
data = dd.read_csv(sys.argv[1], header=None)

features = data[FEATURE_COLUMNS].values
target = data[TARGET_COLUMN].values

model = LinearRegression(fit_intercept=False)
reg = model.fit(features, target)

print(reg.coef_)
print('Tiempo transcurrido:', time.time() - t0)

client.close()
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Estimate the sky from visibilities with a regularised linear fit.

    Two code paths are available:

    * ``usedask=False``: build the operator with ``self.make_gamma(sphere)``,
      split it into stacked real/imaginary float32 parts and solve with
      ``linear_model.ElasticNet`` (``l1_ratio=0.05``, ``positive=True``).
    * ``usedask=True``: build the operator as dask arrays on a local
      ``dask.distributed`` cluster and solve with dask-ml's
      ``LinearRegression`` using a ``dask_glm`` L2 regularizer.

    Args:
        vis_arr: Complex visibilities; real and imaginary parts are
            concatenated into one float32 vector.
        sphere: Object providing ``pixels``, ``l``, ``m``, ``n``, ``npix``
            and ``set_visible_pixels``.
        alpha: Regularisation strength; divided by ``sqrt(n_s)`` before use.
        scale: Accepted for interface compatibility but not used here;
            ``set_visible_pixels`` is always called with ``scale=True``.
        usedask: Select the dask code path when true.

    Returns:
        The solved sky reshaped to a column, ``sky.reshape(-1, 1)``.

    Side effects:
        Calls ``sphere.set_visible_pixels``. The dask path also starts a
        ``LocalCluster``/``Client`` that is left running and sets the global
        dask config key ``array.chunk-size``.
    """
    n_s = sphere.pixels.shape[0]
    lambduh = alpha / np.sqrt(n_s)

    if not usedask:
        gamma = self.make_gamma(sphere)

        logger.info("Building Augmented Operator...")
        proj_operator_real = np.real(gamma).astype(np.float32)
        proj_operator_imag = np.imag(gamma).astype(np.float32)
        gamma = None  # drop the reference to the complex operator
        proj_operator = np.block([[proj_operator_real], [proj_operator_imag]])
        # Drop the references to the two halves now that they are stacked.
        proj_operator_real = None
        proj_operator_imag = None
        logger.info('augmented: {}'.format(proj_operator.shape))

        vis_aux = np.array(
            np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
            dtype=np.float32)
        logger.info('vis mean: {} shape: {}'.format(
            np.mean(vis_aux), vis_aux.shape))

        logger.info("Solving...")
        reg = linear_model.ElasticNet(alpha=lambduh,
                                      l1_ratio=0.05,
                                      max_iter=10000,
                                      positive=True)
        reg.fit(proj_operator, vis_aux)
        sky = reg.coef_

        score = reg.score(proj_operator, vis_aux)
        logger.info('Loss function: {}'.format(score))
    else:
        from dask_ml.linear_model import LinearRegression
        import dask_glm
        import dask.array as da
        from dask.distributed import Client, LocalCluster
        import dask

        logger.info('Starting Dask Client')
        # An already running scheduler could be used instead via
        # Client('tcp://localhost:8786').
        cluster = LocalCluster(dashboard_address=':8231', processes=False)
        client = Client(cluster)
        logger.info("Client = {}".format(client))

        p2j = 2 * np.pi * 1.0j
        dl = sphere.l
        dm = sphere.m
        n_arr_minus_1 = sphere.n - 1
        norm = np.sqrt(sphere.npix)  # loop-invariant normalisation

        # One harmonic (operator row) per (u, v, w) sample. The persisted
        # array is the one that is kept; previously the persisted result was
        # bound to a misspelled name and thrown away.
        harmonic_list = [
            client.persist(
                da.from_array(
                    np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) / norm,
                    chunks=(n_s,)))
            for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr)
        ]

        gamma = da.stack(harmonic_list)
        logger.info('Gamma Shape: {}'.format(gamma.shape))
        gamma = gamma.conj()
        gamma = client.persist(gamma)
        logger.info('Gamma Shape: {}'.format(gamma.shape))

        logger.info("Building Augmented Operator...")
        # Stack real over imaginary parts so the problem is purely real.
        proj_operator = da.block([[da.real(gamma)], [da.imag(gamma)]])
        proj_operator = client.persist(proj_operator)
        logger.info("Proj Operator shape {}".format(proj_operator.shape))

        vis_aux = da.from_array(
            np.array(
                np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                dtype=np.float32))

        en = dask_glm.regularizers.L2()

        dask.config.set({'array.chunk-size': '1024MiB'})
        # Every chunk spans all n_s columns; the row chunking is automatic.
        A = da.rechunk(proj_operator, chunks=('auto', n_s))
        A = client.persist(A)
        y = client.persist(vis_aux)
        logger.info("Rechunking completed.. A= {}.".format(A.shape))

        reg = LinearRegression(penalty=en,
                               C=1.0 / lambduh,
                               fit_intercept=False,
                               solver='lbfgs',
                               max_iter=1000,
                               tol=1e-8)
        reg.fit(A, y)
        sky = reg.coef_

        score = reg.score(proj_operator, vis_aux)
        logger.info('Loss function: {}'.format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    sphere.set_visible_pixels(sky, scale=True)
    return sky.reshape(-1, 1)
# In[43]:

# Start a dask.distributed client with default settings and display it.
client = Client()
client


# #### Linear Regression

# In[44]:

from scikitplot.estimators import plot_feature_importances
from scikitplot.metrics import plot_calibration_curve
from scikitplot.plotters import plot_learning_curve


# In[45]:

lr = LinearRegression()

# Fit and predict inside the joblib 'dask' parallel backend context.
with joblib.parallel_backend('dask'):
    train_features = X_train.values
    train_target = y_train.values
    lr_model = lr.fit(train_features, train_target)
    y_pred_lr = lr.predict(X_test.values)


# In[46]:

mse(y_test.values, y_pred_lr)


# In[47]:

# Both operands are computed to concrete values before scoring.
r2_score(y_test.values.compute(), y_pred_lr.compute())


# ### Non Linear Models

# #### Random Forest Regressor