def __init__(self, method, yrange, params, i=0, ransacparams={}):
    self.method = method
    self.outliers = None
    self.inliers = None
    self.ransac = False
    self.yrange = yrange[i]

    if self.method[i] == 'PLS':
        self.model = PLSRegression(**params[i])
    if self.method[i] == 'OLS':
        self.model = linear.LinearRegression(**params[i])
    if self.method[i] == 'OMP':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV parameter, which the estimator does not accept
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.OrthogonalMatchingPursuit(**params_temp)
        else:
            # the CV variant chooses n_nonzero_coefs itself
            params_temp.pop('n_nonzero_coefs')
            self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)
    if self.method[i] == 'Lasso':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.Lasso(**params_temp)
        else:
            # the CV variant chooses alpha itself
            params_temp.pop('alpha')
            self.model = linear.LassoCV(**params_temp)
    if self.method[i] == 'Elastic Net':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.ElasticNet(**params_temp)
        else:
            params_temp.pop('alpha')
            self.model = linear.ElasticNetCV(**params_temp)
    if self.method[i] == 'Ridge':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.Ridge(**params_temp)
        else:
            # RidgeCV requires a specific set of alphas to be provided...
            # this needs more work to be implemented correctly
            self.model = linear.RidgeCV(**params_temp)
    if self.method[i] == 'Bayesian Ridge':
        self.model = linear.BayesianRidge(**params[i])
    if self.method[i] == 'ARD':
        self.model = linear.ARDRegression(**params[i])
    if self.method[i] == 'LARS':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.Lars(**params_temp)
        else:
            self.model = linear.LarsCV(**params_temp)
    if self.method[i] == 'Lasso LARS':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # check whether to do IC or not
        self.do_ic = params[i]['IC']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove the CV and IC parameters, which the estimators do not accept
        params_temp.pop('CV')
        params_temp.pop('IC')
        if self.do_cv is False and self.do_ic is False:
            self.model = linear.LassoLars(**params_temp)
        if self.do_cv is True and self.do_ic is False:
            self.model = linear.LassoLarsCV(**params_temp)
        if self.do_cv is False and self.do_ic is True:
            self.model = linear.LassoLarsIC(**params_temp)
        if self.do_cv is True and self.do_ic is True:
            print("Can't use both cross validation AND information criterion to optimize!")
    if self.method[i] == 'SVR':
        self.model = svm.SVR(**params[i])
    if self.method[i] == 'KRR':
        self.model = kernel_ridge.KernelRidge(**params[i])
    if self.method[i] == 'GP':
        # get the method for dimensionality reduction and the number of components
        self.reduce_dim = params[i]['reduce_dim']
        self.n_components = params[i]['n_components']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove parameters not accepted by GaussianProcess
        params_temp.pop('reduce_dim')
        params_temp.pop('n_components')
        self.model = GaussianProcess(**params_temp)
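
# A minimal usage sketch for the constructor above, assuming the enclosing
# class is named `regression` (hypothetical) and `linear` is
# sklearn.linear_model; the 'OLS' branch is exercised here.
import numpy as np

X = np.random.rand(50, 5)
y = X @ np.array([1.0, 0.0, 2.0, 0.0, -1.0]) + 0.1 * np.random.rand(50)

wrapper = regression(method=['OLS'],
                     yrange=[(y.min(), y.max())],
                     params=[{'fit_intercept': True}])
wrapper.model.fit(X, y)
print(wrapper.model.coef_)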
########################################################################################################################
r1 = linear_model.LinearRegression(normalize=True, n_jobs=29)
r2 = ensemble.RandomForestRegressor(max_depth=3, min_samples_split=2,
                                    random_state=0, n_estimators=700)
r3 = ensemble.AdaBoostRegressor(random_state=0, loss='linear',
                                learning_rate=3.0, n_estimators=700)
r4 = ensemble.GradientBoostingRegressor()
r5 = ensemble.BaggingRegressor()  # overfitting
r6 = ensemble.ExtraTreesRegressor()  # overfitting
r7 = linear_model.BayesianRidge(normalize=True)
r8 = linear_model.ARDRegression(normalize=True)
r9 = linear_model.HuberRegressor()
r10 = linear_model.Lasso(random_state=0, selection='cyclic', normalize=False)
r11 = svm.LinearSVR(random_state=0, loss='squared_epsilon_insensitive',
                    dual=True)
r12 = gaussian_process.GaussianProcessRegressor()  # overfitting
r13 = linear_model.PassiveAggressiveRegressor()  # takes okayish time
r14 = linear_model.RANSACRegressor()  # overfitting?
r15 = linear_model.SGDRegressor(shuffle=True, penalty='l1',
                                loss='squared_epsilon_insensitive',
                                learning_rate='invscaling', epsilon=0.1,
                                early_stopping=False, average=True)
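
# A sketch of how a list of candidates like r1..r15 might be compared,
# assuming X and y are already-loaded feature/target arrays (they are not
# defined in the snippet above).
from sklearn.model_selection import cross_val_score

for name, reg in [('linear', r1), ('forest', r2), ('ard', r8)]:
    scores = cross_val_score(reg, X, y, cv=5, scoring='r2')
    print('%s: mean R^2 = %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))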
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas
from sklearn import linear_model
from sklearn import datasets

dataset = pandas.read_csv('clean6001.csv')
array = dataset.values[:2000]
X = array[:, 3:5]
y = array[:, 6]

model_aic = linear_model.ARDRegression()
model_aic.fit(X, y)
y_aic = model_aic.predict(X)

Y_validation = y
plt.scatter(range(len(X))[0:2000], Y_validation[:2000], color='orange')
plt.plot(range(len(X))[0:2000], y_aic[:2000], color='red', linewidth=3)
plt.show()
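
# The script above predicts on the same rows it was fit on. A minimal
# variant with a holdout split, assuming the same 'clean6001.csv' layout,
# so the score reflects unseen rows:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = linear_model.ARDRegression()
model.fit(X_tr, y_tr)
print('holdout R^2:', model.score(X_te, y_te))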
                          columns=cols_dynamicRes)  # training results

#X_train1_dynamic, X_test1_dynamic, y_train1_dynamic, y_test1_dynamic = train_test_split(dfArr1_dynamic, dfRes1_dynamic, test_size=0.2)
#print(X_train1_dynamic.shape, y_train1_dynamic.shape)
#print(X_test1_dynamic.shape, y_test1_dynamic.shape)

#feat_extr = SelectKBest(k=7)
#fitter = feat_extr.fit(dfArr1_dynamic, ravel(dfRes1_dynamic))
#scores1 = fitter.scores_
#scores = pd.DataFrame(fitter.scores_, index=cols_dynamicAttr)

#model = ExtraTreesClassifier()
#model = model.fit(dfArr1_dynamic, ravel(dfRes1_dynamic))
#model_scores = pd.DataFrame(model.feature_importances_, index=cols_dynamicAttr)

#rlasso = RandomizedLasso()
#lasso = rlasso.fit(dfArr1_dynamic, ravel(dfRes1_dynamic))
#lasso_scores = pd.DataFrame(lasso.scores_, index=cols_dynamicAttr)

ard = linear_model.ARDRegression(compute_score=True)
autorelevdet = ard.fit(dfArr1_dynamic, ravel(dfRes1_dynamic))
# NOTE: ARDRegression.scores_ holds one objective value per iteration, not
# one score per feature, so indexing it by cols_dynamicAttr may not line up
ard_scores = pd.DataFrame(autorelevdet.scores_, index=cols_dynamicAttr)
ard_coef = pd.DataFrame(autorelevdet.coef_, index=cols_dynamicAttr)
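
# A sketch of inspecting ARD's pruning behaviour on the fit above: features
# whose precision exceeded threshold_lambda are driven to (near) zero, so
# ranking |coef_| surfaces the features the model effectively kept.
order = ard_coef[0].abs().sort_values(ascending=False)
print('features ranked by ARD coefficient magnitude:')
print(ard_coef.loc[order.index].head(10))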
def revenue_growth_model(ticker):
    financial_data = scraper.getFinancialData(ticker)
    revenue = financial_data["Revenue"]
    df = pd.DataFrame.from_dict(revenue.items())
    x = df[0].to_frame()  # x-values are the years
    y = df[1].to_frame()  # y-values are revenue values (given)

    ### linear modeling ###
    """ make the models """
    ols_reg = linear_model.LinearRegression()  # ordinary least squares
    ridge_reg = linear_model.Ridge()  # ridge regression
    lasso_reg = linear_model.Lasso()  # lasso regression
    LARS_reg = linear_model.LassoLars()  # least angle regression (on lasso)
    b_ridge_reg = linear_model.BayesianRidge()  # bayesian ridge regression
    ard_reg = linear_model.ARDRegression()  # bayesian ARD regression
    sgd_reg = linear_model.SGDRegressor()  # stochastic gradient descent regression
    ransac_model = linear_model.RANSACRegressor(ols_reg)  # fit linear model with RANdom SAmple Consensus algorithm

    """ fit the models to a regression function based on data """
    ols_reg.fit(x, y)
    ridge_reg.fit(x, y)
    lasso_reg.fit(x, y)
    LARS_reg.fit(x, y)
    b_ridge_reg.fit(x, y)
    ard_reg.fit(x, y)
    sgd_reg.fit(x, y)
    ransac_model.fit(x, y)

    ### model scoring ###
    # note: these are in-sample R^2 scores, not k-fold cross-validation scores
    cv_scores = {
        'ols_scores': ols_reg.score(x, y),
        'ridge_scores': ridge_reg.score(x, y),
        'lasso_scores': lasso_reg.score(x, y),
        'LARS_scores': LARS_reg.score(x, y),
        'b_ridge_scores': b_ridge_reg.score(x, y),
        'ard_scores': ard_reg.score(x, y),
        'sgd_scores': sgd_reg.score(x, y),
        'ransac_scores': ransac_model.score(x, y)
    }
    vals = list(cv_scores.values())
    keys = list(cv_scores.keys())
    max_cv = keys[vals.index(max(vals))]
    print(vals)
    print(max_cv)

    predicted = []
    if max_cv == 'ols_scores':
        predicted = ols_reg.predict(x)
    elif max_cv == 'ridge_scores':
        predicted = ridge_reg.predict(x)
    elif max_cv == 'lasso_scores':
        predicted = lasso_reg.predict(x)
    elif max_cv == 'LARS_scores':
        predicted = LARS_reg.predict(x)
    elif max_cv == 'b_ridge_scores':
        predicted = b_ridge_reg.predict(x)
    elif max_cv == 'ard_scores':
        predicted = ard_reg.predict(x)
    elif max_cv == 'sgd_scores':
        predicted = sgd_reg.predict(x)
    else:
        predicted = ransac_model.predict(x)

    return {'x': x, 'y': y, 'max_cv': max_cv, 'predicted': predicted}
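
# If genuine k-fold scores are wanted instead of the in-sample R^2 used
# above, a sketch (assumes x and y as built in the function; cv=3 keeps the
# folds viable for a short yearly series):
from sklearn.model_selection import cross_val_score
import numpy as np

cv_scores = {name: np.mean(cross_val_score(reg, x, y.values.ravel(), cv=3))
             for name, reg in [('ols', linear_model.LinearRegression()),
                               ('ridge', linear_model.Ridge()),
                               ('ard', linear_model.ARDRegression())]}
best = max(cv_scores, key=cv_scores.get)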
def __init__(self, method, yrange, params, i=0):
    # TODO: yrange doesn't currently do anything. Remove or do something with it!
    self.algorithm_list = ['PLS', 'GP', 'OLS', 'OMP', 'Lasso',
                           'Elastic Net', 'Ridge', 'Bayesian Ridge', 'ARD',
                           'LARS', 'LASSO LARS', 'SVR', 'KRR', 'GBR']
    self.method = method
    self.outliers = None
    self.ransac = False
    #print(params)

    if self.method[i] == 'PLS':
        self.model = PLSRegression(**params[i])
    if self.method[i] == 'OLS':
        self.model = linear.LinearRegression(**params[i])
    if self.method[i] == 'OMP':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        self.model = linear.OrthogonalMatchingPursuit(**params_temp)
    if self.method[i] == 'LASSO':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        self.model = linear.Lasso(**params_temp)
    if self.method[i] == 'Elastic Net':
        params_temp = copy.copy(params[i])
        self.model = linear.ElasticNet(**params_temp)
    if self.method[i] == 'Ridge':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        self.model = linear.Ridge(**params_temp)
    if self.method[i] == 'BRR':
        self.model = linear.BayesianRidge(**params[i])
    if self.method[i] == 'ARD':
        self.model = linear.ARDRegression(**params[i])
    if self.method[i] == 'LARS':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        self.model = linear.Lars(**params_temp)
    if self.method[i] == 'LASSO LARS':
        # index into params, as in every other branch
        self.model = linear.LassoLars(**params[i])
    if self.method[i] == 'SVR':
        self.model = svm.SVR(**params[i])
    if self.method[i] == 'KRR':
        self.model = kernel_ridge.KernelRidge(**params[i])
    if self.method[i] == 'GP':
        # get the method for dimensionality reduction and the number of components
        self.reduce_dim = params[i]['reduce_dim']
        self.n_components = params[i]['n_components']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # remove parameters not accepted by GaussianProcessRegressor
        params_temp.pop('reduce_dim')
        params_temp.pop('n_components')
        self.model = GaussianProcessRegressor(**params_temp)
    if self.method[i] == 'GBR':
        self.model = GradientBoostingRegressor(**params[i])
def compute_regression_model(y, xs, years, country_list, target, ks):
    countries_list_iso3 = [
        pycountry.countries.get(name=country).alpha_3
        for country in country_list
    ]
    idx = pd.MultiIndex.from_product([countries_list_iso3, years],
                                     names=["Country", "Year"])
    col = ["Predicted"]
    prediction_df = pd.DataFrame('-', idx, col)
    res = defaultdict(dict)

    for country in countries_list_iso3:
        #country = pycountry.countries.get(name=c).alpha_3
        '''temp = xs_additional.loc[(years, country), :]
        temp.index = temp.index.droplevel(1)
        temp = pd.concat([temp for i in range(len(xs.index.levels[0].tolist()))],
                         keys=xs.index.levels[0].tolist(), names=['Province'])
        xs_plus = xs.copy()
        xs_plus = pd.concat([xs_plus, temp], axis=1)
        df = bdf.filter_origin_country_dataset(y, country, years, [target], xs_plus, 2)'''
        df = bdf.filter_origin_country_dataset(y, country, years, [target],
                                               xs, 2)
        df = df.reset_index(level=0, drop=True)
        X = df.drop(["y"], axis=1)
        y_temp = df["y"]

        # normalize each feature-scoring method, then aggregate them
        f_regression_norm = normalize(
            (f_regression(X, y_temp)[0]).reshape(1, -1))[0]
        mutual_info_regression_norm = normalize(
            mutual_info_regression(X, y_temp).reshape(1, -1))[0]
        scorers_aggregation = sum(
            [f_regression_norm, mutual_info_regression_norm])
        scorers_aggregation_norm = normalize(
            scorers_aggregation.reshape(1, -1))[0]
        scorers_list = [
            "f_regression_norm", "mutual_info_regression_norm",
            "scorers_aggregation_norm"
        ]
        models_function = [
            linear_model.LinearRegression(normalize=True),
            linear_model.LassoCV(alphas=[0.01, 0.05, 0.1, 1], normalize=True),
            linear_model.RidgeCV(alphas=[0.01, 0.05, 0.1, 1], normalize=True),
            linear_model.BayesianRidge(normalize=True),
            linear_model.ARDRegression(normalize=True)
        ]

        model = []
        mse = []
        features = []
        for scorer in scorers_list:
            #print(scorer)
            model_temp_k = []
            mse_temp_k = []
            features_temp_k = []
            for k in ks:
                # vars()[scorer] looks up the scorer array by its name
                temp = mse_best_model(X, y_temp, vars()[scorer], k,
                                      models_function)
                model_temp_k.append(temp[0])
                mse_temp_k.append(temp[1])
                features_temp_k.append(temp[2])
            model.append(model_temp_k[mse_temp_k.index(min(mse_temp_k))])
            mse.append(min(mse_temp_k))
            features.append(features_temp_k[mse_temp_k.index(min(mse_temp_k))])

        model = model[mse.index(min(mse))]
        features = features[mse.index(min(mse))]
        clf = [
            reg for reg in models_function
            if model == str(reg).split("(")[0]
        ][0].fit(X[features], y_temp)
        prediction = clf.predict(X[features])
        prediction_df.loc[(country, years), "Predicted"] = prediction
        print(country)
        res[country]["features"] = features
        res[country]["coefficients"] = np.concatenate(
            (np.array([clf.intercept_]), clf.coef_))
        res[country]["model"] = model

    #prediction_df.index = years
    prediction_df = prediction_df.swaplevel()
    prediction_df = prediction_df.sort_index()
    return (prediction_df, res)
def training(request, model):
    acao = request.session['acao']
    bolsa = pd.read_csv("app/data/bolsa.csv",
                        index_col='Date').groupby('Codigo')
    dados = bolsa.get_group(acao)
    X = dados[['Open', 'High', 'Low', 'Close', 'Volume']]
    y = dados['High'].shift(-1).fillna(method='pad')
    Y = pd.DataFrame({
        'Alta_real': dados['High'].shift(-1).fillna(method='pad'),
        'Baixa_real': dados['Low'].shift(-1).fillna(method='pad')
    })
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, shuffle=False, random_state=0)
    X_train, X_test, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.20, shuffle=False, random_state=0)
    base = dados.to_html()

    # single-output training
    regr = linear_model.BayesianRidge()
    regr.fit(X_train, y_train)

    # multi-output training
    if (model == 'adr'):
        modelo = "Automatic Relevance Determination Regression"
        #regr_multi = MultiOutputRegressor(svm.SVR())
        regr_multi = MultiOutputRegressor(
            linear_model.ARDRegression(compute_score=True))
    elif (model == 'ada'):
        modelo = "Ada Regressor"
        regr_multi = MultiOutputRegressor(
            AdaBoostRegressor(random_state=0, n_estimators=100))
    elif (model == 'GB'):
        modelo = "GradientBoostingRegressor"
        regr_multi = MultiOutputRegressor(
            GradientBoostingRegressor(random_state=1, n_estimators=10))
    else:
        modelo = "LinearRegression with Bayesian Ridge"
        regr_multi = MultiOutputRegressor(linear_model.BayesianRidge())
    """
    # importing VotingRegressor does not work; sklearn needs to be updated
    elif (model == 'VR'):
        modelo = "Voting Regressor with GradientBoostingRegressor, RandomForestRegressor, LinearRegression"
        reg1 = MultiOutputRegressor(GradientBoostingRegressor(random_state=1, n_estimators=10))
        reg2 = MultiOutputRegressor(RandomForestRegressor(random_state=1, n_estimators=10))
        reg3 = MultiOutputRegressor(LinearRegression())
        regr_multi = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
    """
    regr_multi.fit(X_train, Ytrain)
    Y_PRED = regr_multi.predict(X_test)

    real = pd.DataFrame(Ytest)
    previsto = pd.DataFrame(Y_PRED, index=Ytest.index,
                            columns=['Alta_prevista', 'Baixa_prevista'])
    #real.rename(columns={"High": "real"})
    #previsto = previsto.set_index(real.index)
    data = pd.concat([real, previsto], axis=1)
    data['diferenca_alta'] = data['Alta_real'] - data['Alta_prevista']
    data['diferenca_baixa'] = data['Baixa_real'] - data['Baixa_prevista']
    erro = data['diferenca_alta']
    data = data.to_html()
    #data = previsto.head().to_html()
    """
    # single-output forecast
    y_pred = regr.predict(X_test)
    real = pd.DataFrame(y_test)
    previsto = pd.DataFrame(y_pred, index=real.index, columns=['previsto'])
    #real.rename(columns={"High": "real"})
    #previsto = previsto.set_index(real.index)
    data = pd.concat([real, previsto], axis=1)
    data['diferenca'] = data['High'] - data['previsto']
    erro = np.array(data['diferenca'])
    data = data.to_html()
    #data = previsto.head().to_html()
    """

    # metrics
    mae = mean_absolute_error(Ytest, Y_PRED)
    mse = mean_squared_error(Ytest, Y_PRED)
    ev = explained_variance_score(Ytest, Y_PRED,
                                  multioutput='uniform_average')
    r2 = r2_score(Ytest, Y_PRED)

    # chart
    plt.figure(figsize=(5, 5))
    plt.xlabel("Date")
    plt.ylabel("High")
    plt.title(acao)
    #plt.plot(y_train)
    plt.plot(Ytest['Alta_real'])
    plt.plot(previsto['Alta_prevista'])
    #plt.grid(True)
    plt.savefig("media/forecast_reg.png")

    plt.figure(figsize=(5, 5))
    plt.title('High error (actual - predicted)')
    plt.grid(True)
    plt.hist(erro, bins=5)
    plt.savefig("media/hist_reg.png")

    # params
    params = regr.get_params()

    # persistence
    if (model == 'VR'):
        dump(regr_multi, 'app/learners/' + acao + '_VR.joblib')
    elif (model == 'GB'):
        dump(regr_multi, 'app/learners/' + acao + '_GB.joblib')
    elif (model == 'adr'):
        dump(regr_multi, 'app/learners/' + acao + '_ADR.joblib')
    elif (model == 'ada'):
        dump(regr_multi, 'app/learners/' + acao + '_ADAR.joblib')
    else:
        dump(regr_multi, 'app/learners/' + acao + '_NBR.joblib')

    context = {
        'title': 'Regression Training',
        'mae': mae,
        'mse': mse,
        'ev': ev,
        'r2': r2,
        'base': base,
        'data': data,
        'acao': acao,
        'modelo': modelo,
        'params': params,
        'multi': Y_PRED[0]
    }
    return render(request, 'app/training.html', context)
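
# A sketch of reloading one of the persisted models above for later
# prediction, assuming joblib's load counterpart to the dump calls; the
# ticker string is hypothetical.
from joblib import load

regr_loaded = load('app/learners/' + 'PETR4' + '_NBR.joblib')  # hypothetical ticker
# next_high_low = regr_loaded.predict(latest_features)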
        poly2coefs = poly.polyfit(x, y, 2)
        poly2fit = poly.polyval(x_new, poly2coefs)
        fit_dic['poly2'] = poly2fit
    if 'poly3' in fits:
        poly3coefs = poly.polyfit(x, y, 3)
        poly3fit = poly.polyval(x_new, poly3coefs)
        fit_dic['poly3'] = poly3fit
    if 'spline' in fits:
        spline_params = splrep(x, y, s=s, k=3)
        splinefit = splev(x_new, spline_params)
        fit_dic['spline'] = splinefit
    return fit_dic


modeldict = {
    'ardregression': lm.ARDRegression(),
    'bayesianridge': lm.BayesianRidge(),
    'elasticnet': lm.ElasticNet(),
    'elasticnetcv': lm.ElasticNetCV(),
    'huberregression': lm.HuberRegressor(),
    'lars': lm.Lars(),
    'larscv': lm.LarsCV(),
    'lasso': lm.Lasso(),
    'lassocv': lm.LassoCV(),
    'lassolars': lm.LassoLars(),
    'lassolarscv': lm.LassoLarsCV(),
    'lassolarsic': lm.LassoLarsIC(),
    'linearregression': lm.LinearRegression(),
    'orthogonalmatchingpursuit': lm.OrthogonalMatchingPursuit(),
    'orthogonalmatchingpursuitcv': lm.OrthogonalMatchingPursuitCV(),
    'passiveagressiveregressor': lm.PassiveAggressiveRegressor(),
def fit_regression(P, x, u, rule="LS", retall=False, **kws):
    """
    Fit a polynomial chaos expansion using linear regression.

    Args:
        P (Poly) : Polynomial expansion with `P.shape=(M,)` and `P.dim=D`.
        x (array_like) : Collocation nodes with `x.shape=(D,K)`.
        u (array_like) : Model evaluations with `len(u)=K`.
        retall (bool) : If True return Fourier coefficients in addition to R.
        rule (str) : Regression method used.

    Returns:
        (Poly, np.ndarray) : Fitted polynomial with `R.shape=u.shape[1:]` and
        `R.dim=D`. The Fourier coefficients in the estimation.

    Examples:
        >>> x, y = cp.variable(2)
        >>> P = cp.Poly([1, x, y])
        >>> s = [[-1,-1,1,1], [-1,1,-1,1]]
        >>> u = [0,1,1,2]
        >>> print(cp.around(fit_regression(P, s, u), 14))
        0.5q0+0.5q1+1.0
    """
    x = np.array(x)
    if len(x.shape) == 1:
        x = x.reshape(1, *x.shape)
    u = np.array(u)

    Q = P(*x).T
    shape = u.shape[1:]
    u = u.reshape(u.shape[0], int(np.prod(u.shape[1:])))

    rule = rule.upper()

    # Local rules
    if rule == "LS":
        uhat = linalg.lstsq(Q, u)[0].T

    elif rule == "T":
        uhat, alphas = rlstsq(Q, u, kws.get("order", 0),
                              kws.get("alpha", None), False, True)
        uhat = uhat.T

    elif rule == "TC":
        uhat = rlstsq(Q, u, kws.get("order", 0), kws.get("alpha", None), True)
        uhat = uhat.T

    else:
        # Scikit-learn wrapper
        try:
            _ = linear_model
        except NameError:
            raise NotImplementedError("sklearn not installed")

        if rule == "BARD":
            solver = linear_model.ARDRegression(fit_intercept=False,
                                                copy_X=False, **kws)
        elif rule == "BR":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.BayesianRidge(**kws)
        elif rule == "EN":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.ElasticNet(**kws)
        elif rule == "ENC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.ElasticNetCV(**kws)
        elif rule == "LA":  # success
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.Lars(**kws)
        elif rule == "LAC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.LarsCV(**kws)
        elif rule == "LAS":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.Lasso(**kws)
        elif rule == "LASC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.LassoCV(**kws)
        elif rule == "LL":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.LassoLars(**kws)
        elif rule == "LLC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.LassoLarsCV(**kws)
        elif rule == "LLIC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = linear_model.LassoLarsIC(**kws)
        elif rule == "OMP":
            solver = linear_model.OrthogonalMatchingPursuit(**kws)

        uhat = solver.fit(Q, u).coef_

    u = u.reshape(u.shape[0], *shape)

    R = cp.poly.sum((P * uhat), -1)
    R = cp.poly.reshape(R, shape)

    if retall == 1:
        return R, uhat
    elif retall == 2:
        if rule == "T":
            return R, uhat, Q, alphas
        return R, uhat, Q
    return R
def fit_regression(P, x, u, rule="LS", retall=False, **kws):
    """
    Fit a polynomial chaos expansion using linear regression.

    Parameters
    ----------
    P : Poly
        Polynomial chaos expansion with `P.shape=(M,)` and `P.dim=D`.
    x : array_like
        Collocation nodes with `x.shape=(D,K)`.
    u : array_like
        Model evaluations with `len(u)=K`.
    retall : bool
        If True return uhat in addition to R.
    rule : str
        Regression method used.

        The following methods use scikit-learn as backend.
        See `sklearn.linear_model` for more details.

        Key     Scikit-learn    Description
        ---     ------------    -----------
        "BARD"  ARDRegression   Bayesian ARD Regression
            n_iter=300              Maximum iterations
            tol=1e-3                Optimization tolerance
            alpha_1=1e-6            Gamma scale parameter
            alpha_2=1e-6            Gamma inverse scale parameter
            lambda_1=1e-6           Gamma shape parameter
            lambda_2=1e-6           Gamma inverse scale parameter
            threshold_lambda=1e-4   Upper pruning threshold

        "BR"    BayesianRidge   Bayesian Ridge Regression
            n_iter=300              Maximum iterations
            tol=1e-3                Optimization tolerance
            alpha_1=1e-6            Gamma scale parameter
            alpha_2=1e-6            Gamma inverse scale parameter
            lambda_1=1e-6           Gamma shape parameter
            lambda_2=1e-6           Gamma inverse scale parameter

        "EN"    ElasticNet      Elastic Net
            alpha=1.0               Dampening parameter
            rho                     Mixing parameter in [0,1]
            max_iter=300            Maximum iterations
            tol                     Optimization tolerance

        "ENC"   ElasticNetCV    EN w/Cross Validation
            rho                     Dampening parameter(s)
            eps=1e-3                min(alpha)/max(alpha)
            n_alphas                Number of alphas
            alphas                  List of alphas
            max_iter                Maximum iterations
            tol                     Optimization tolerance
            cv=3                    Cross validation folds

        "LA"    Lars            Least Angle Regression
            n_nonzero_coefs         Number of non-zero coefficients
            eps                     Cholesky regularization

        "LAC"   LarsCV          LAR w/Cross Validation
            max_iter                Maximum iterations
            cv=5                    Cross validation folds
            max_n_alphas            Max points for residuals in cv

        "LAS"   Lasso           Least Absolute Shrinkage and
                                Selection Operator
            alpha=1.0               Dampening parameter
            max_iter                Maximum iterations
            tol                     Optimization tolerance

        "LASC"  LassoCV         LAS w/Cross Validation
            eps=1e-3                min(alpha)/max(alpha)
            n_alphas                Number of alphas
            alphas                  List of alphas
            max_iter                Maximum iterations
            tol                     Optimization tolerance
            cv=3                    Cross validation folds

        "LL"    LassoLars       Lasso and Lars model
            max_iter                Maximum iterations
            eps                     Cholesky regularization

        "LLC"   LassoLarsCV     LL w/Cross Validation
            max_iter                Maximum iterations
            cv=5                    Cross validation folds
            max_n_alphas            Max points for residuals in cv
            eps                     Cholesky regularization

        "LLIC"  LassoLarsIC     LL w/AIC or BIC criterion
            criterion               "AIC" or "BIC"
            max_iter                Maximum iterations
            eps                     Cholesky regularization

        "OMP"   OrthogonalMatchingPursuit
            n_nonzero_coefs         Number of non-zero coefficients
            tol                     Max residual norm (instead of
                                    non-zero coef)

        Local methods

        Key     Description
        ---     -----------
        "LS"    Ordinary Least Squares

        "T"     Ridge Regression/Tikhonov Regularization
            order                   Order of regularization (or custom matrix)
            alpha                   Damping parameter (else estimated from gcv)

        "TC"    T w/Cross Validation
            order                   Order of regularization (or custom matrix)
            alpha                   Damping parameter (else estimated from gcv)

    Returns
    -------
    R[, uhat]

    R : Poly
        Fitted polynomial with `R.shape=u.shape[1:]` and `R.dim=D`.
    uhat : np.ndarray
        The Fourier coefficients in the estimation.

    Examples
    --------
    >>> x, y = cp.variable(2)
    >>> P = cp.Poly([1, x, y])
    >>> x = [[-1,-1,1,1], [-1,1,-1,1]]
    >>> u = [0,1,1,2]
    >>> print(fit_regression(P, x, u))
    0.5q1+0.5q0+1.0
    """
    x = np.array(x)
    if len(x.shape) == 1:
        x = x.reshape(1, *x.shape)
    u = np.array(u)

    Q = P(*x).T
    shape = u.shape[1:]
    u = u.reshape(u.shape[0], np.prod(u.shape[1:]))

    rule = rule.upper()

    # Local rules
    if rule == "LS":
        uhat = la.lstsq(Q, u)[0]

    elif rule == "T":
        uhat = rlstsq(Q, u, kws.get("order", 0), kws.get("alpha", None),
                      False)

    elif rule == "TC":
        uhat = rlstsq(Q, u, kws.get("order", 0), kws.get("alpha", None), True)

    else:
        # Scikit-learn wrapper
        try:
            _ = lm
        except NameError:
            raise NotImplementedError("sklearn not installed")

        if rule == "BARD":
            solver = lm.ARDRegression(fit_intercept=False, copy_X=False,
                                      **kws)
        elif rule == "BR":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.BayesianRidge(**kws)
        elif rule == "EN":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.ElasticNet(**kws)
        elif rule == "ENC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.ElasticNetCV(**kws)
        elif rule == "LA":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.Lars(**kws)
        elif rule == "LAC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LarsCV(**kws)
        elif rule == "LAS":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.Lasso(**kws)
        elif rule == "LASC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoCV(**kws)
        elif rule == "LL":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLars(**kws)
        elif rule == "LLC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLarsCV(**kws)
        elif rule == "LLIC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLarsIC(**kws)
        elif rule == "OMP":
            solver = lm.OrthogonalMatchingPursuit(**kws)

        uhat = solver.fit(Q, u).coef_

    u = u.reshape(u.shape[0], *shape)

    R = po.sum((P * uhat.T), -1)
    R = po.reshape(R, shape)

    if retall == 1:
        return R, uhat
    elif retall == 2:
        return R, uhat, Q
    return R
def regress_sys(folder, all_videos, yfit, training_size, randselect=True,
                trainingdata=[], frame=0, have_output=True, download=True,
                bucket_name='ccurtis.data'):
    """Uses regression based on image intensities to select tracking parameters.

    This function uses regression methods from the scikit-learn module to
    predict the lower quality cutoff values for particle filtering in
    TrackMate based on the intensity distributions of input images. Currently
    only uses the first frame of videos for analysis, and is limited to
    predicting quality values.

    In practice, users will run regress_sys twice in different modes to build
    a regression system. First, set have_output to False. The function will
    return a list of randomly selected videos to include in the training
    dataset. The user should then manually track particles using the
    TrackMate GUI, and enter these values in during the next round as the
    input yfit variable.

    Parameters
    ----------
    folder : str
        S3 directory containing video files specified in all_videos.
    all_videos : list of str
        Contains prefixes of video filenames of the entire video set to be
        tracked. The training dataset will be some subset of these videos.
    yfit : numpy.ndarray
        Contains manually acquired quality levels using TrackMate for the
        files contained in the training dataset.
    training_size : int
        Number of files in the training dataset.
    randselect : bool
        If True, will randomly select training videos from all_videos. If
        False, will use trainingdata as the input training dataset.
    trainingdata : list of str
        Optional manually selected prefixes of video filenames to be used as
        the training dataset.
    have_output : bool
        If you have already acquired the quality values (yfit) for the
        training dataset, set to True. If False, it will output the files the
        user will need to acquire quality values for.
    bucket_name : str
        S3 bucket containing videos to be downloaded for regression
        calculations.

    Returns
    -------
    regress_object : list of sklearn.svm.classes
        Contains a list of regression objects assembled from the training
        datasets. Uses the mean, 10th percentile, 90th percentile, and
        standard deviation intensities to predict the quality parameter in
        TrackMate.
    tprefix : list of str
        Contains randomly selected images from all_videos to be included in
        the training dataset.
    """
    if randselect:
        tprefix = []
        for i in range(0, training_size):
            random.seed(i + 1)
            # randint is inclusive at both ends, so cap at len(all_videos) - 1
            tprefix.append(all_videos[random.randint(0, len(all_videos) - 1)])
            if have_output is False:
                print("Get parameters for: {}".format(tprefix[i]))
    else:
        tprefix = trainingdata

    if have_output is True:
        # Define descriptors
        descriptors = np.zeros((training_size, 4))
        counter = 0
        for name in tprefix:
            local_im = name + '.tif'
            remote_im = "{}/{}".format(folder, local_im)
            if download:
                aws.download_s3(remote_im, local_im, bucket_name=bucket_name)
            test_image = sio.imread(local_im)
            descriptors[counter, 0] = np.mean(test_image[frame, :, :])
            descriptors[counter, 1] = np.std(test_image[frame, :, :])
            descriptors[counter, 2] = np.percentile(test_image[frame, :, :], 10)
            descriptors[counter, 3] = np.percentile(test_image[frame, :, :], 90)
            counter = counter + 1

        # Define regression techniques
        xfit = descriptors
        classifiers = [
            svm.SVR(),
            linear_model.SGDRegressor(),
            linear_model.BayesianRidge(),
            linear_model.LassoLars(),
            linear_model.ARDRegression(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.LinearRegression()
        ]

        regress_object = []
        for item in classifiers:
            clf = item
            regress_object.append(clf.fit(xfit, yfit))
        return regress_object
    else:
        return tprefix
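
# A sketch of the two-phase workflow described in the docstring above; the
# folder, prefixes, and quality values are placeholders, and phase 2 assumes
# the named .tif files are already present locally (download=False).
import numpy as np

videos = ['vid_%02d' % n for n in range(20)]

# Phase 1: report which randomly chosen videos need manual TrackMate values.
to_label = regress_sys('s3_folder', videos, yfit=None, training_size=5,
                       have_output=False)

# Phase 2: feed the manually measured quality values back in as yfit.
quality = np.array([4.2, 3.9, 5.1, 4.7, 4.4])  # hypothetical measurements
models = regress_sys('s3_folder', videos, yfit=quality, training_size=5,
                     randselect=False, trainingdata=to_label, download=False)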
# Choose predictors
predictors = features

# Clean the data
tr_data = CleanHousingData(tr_data)

# Split the data with the folds
kf = KFold(n_splits=3, random_state=1, shuffle=True)
for train_index, test_index in kf.split(tr_data):
    # note: each iteration overwrites these, so only the last fold survives
    trainsplit = tr_data.iloc[train_index, :]
    testsplit = tr_data.iloc[test_index, :]

# Finding out which algorithm adjusts better to the data
# Create the algorithm dictionary
ARD = linear_model.ARDRegression()
LinRe = linear_model.LinearRegression()
SGD = linear_model.SGDRegressor()
BR = linear_model.BayesianRidge()
Lars = linear_model.Lars()
Lasso = linear_model.Lasso()
PA = linear_model.PassiveAggressiveRegressor()
RANSAC = linear_model.RANSACRegressor()
Theil = linear_model.TheilSenRegressor()
Gboost = ensemble.GradientBoostingRegressor()
algorithms = {
    'Linear Regression': LinRe,
    'Bayesian ARD regression': ARD,
    'BayesianRidge': BR,
    'Lars': Lars,
    'Lasso': Lasso,
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y,
                     model_type, out_dir=None, class_weight=None):
    from sklearn import datasets, neighbors, linear_model, svm

    totalTime = 0
    startTrainTime = time()
    logger.info("Start training...")
    if model_type == 'ARDRegression':
        model = linear_model.ARDRegression().fit(train_x, train_y)
    elif model_type == 'BayesianRidge':
        model = linear_model.BayesianRidge().fit(train_x, train_y)
    elif model_type == 'ElasticNet':
        model = linear_model.ElasticNet().fit(train_x, train_y)
    elif model_type == 'ElasticNetCV':
        model = linear_model.ElasticNetCV().fit(train_x, train_y)
    elif model_type == 'HuberRegressor':
        model = linear_model.HuberRegressor().fit(train_x, train_y)
    elif model_type == 'Lars':
        model = linear_model.Lars().fit(train_x, train_y)
    elif model_type == 'LarsCV':
        model = linear_model.LarsCV().fit(train_x, train_y)
    elif model_type == 'Lasso':
        model = linear_model.Lasso().fit(train_x, train_y)
    elif model_type == 'LassoCV':
        model = linear_model.LassoCV().fit(train_x, train_y)
    elif model_type == 'LassoLars':
        model = linear_model.LassoLars().fit(train_x, train_y)
    elif model_type == 'LassoLarsCV':
        model = linear_model.LassoLarsCV().fit(train_x, train_y)
    elif model_type == 'LassoLarsIC':
        model = linear_model.LassoLarsIC().fit(train_x, train_y)
    elif model_type == 'LinearRegression':
        model = linear_model.LinearRegression().fit(train_x, train_y)
    elif model_type == 'LogisticRegression':
        model = linear_model.LogisticRegression(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'LogisticRegressionCV':
        model = linear_model.LogisticRegressionCV(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'MultiTaskLasso':
        model = linear_model.MultiTaskLasso().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNet':
        model = linear_model.MultiTaskElasticNet().fit(train_x, train_y)
    elif model_type == 'MultiTaskLassoCV':
        model = linear_model.MultiTaskLassoCV().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNetCV':
        model = linear_model.MultiTaskElasticNetCV().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuit':
        model = linear_model.OrthogonalMatchingPursuit().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuitCV':
        model = linear_model.OrthogonalMatchingPursuitCV().fit(train_x,
                                                               train_y)
    elif model_type == 'PassiveAggressiveClassifier':
        model = linear_model.PassiveAggressiveClassifier(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'PassiveAggressiveRegressor':
        model = linear_model.PassiveAggressiveRegressor().fit(train_x,
                                                              train_y)
    elif model_type == 'Perceptron':
        model = linear_model.Perceptron(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RandomizedLasso':
        model = linear_model.RandomizedLasso().fit(train_x, train_y)
    elif model_type == 'RandomizedLogisticRegression':
        model = linear_model.RandomizedLogisticRegression().fit(train_x,
                                                                train_y)
    elif model_type == 'RANSACRegressor':
        model = linear_model.RANSACRegressor().fit(train_x, train_y)
    elif model_type == 'Ridge':
        model = linear_model.Ridge().fit(train_x, train_y)
    elif model_type == 'RidgeClassifier':
        model = linear_model.RidgeClassifier(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeClassifierCV':
        model = linear_model.RidgeClassifierCV(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeCV':
        model = linear_model.RidgeCV().fit(train_x, train_y)
    elif model_type == 'SGDClassifier':
        model = linear_model.SGDClassifier(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SGDRegressor':
        model = linear_model.SGDRegressor().fit(train_x, train_y)
    elif model_type == 'TheilSenRegressor':
        model = linear_model.TheilSenRegressor().fit(train_x, train_y)
    elif model_type in ('lars_path', 'lasso_path', 'lasso_stability_path',
                        'logistic_regression_path', 'orthogonal_mp',
                        'orthogonal_mp_gram'):
        # these names are module-level functions, not estimator classes,
        # so they have no .fit() method and cannot be used here
        raise NotImplementedError(
            '%s is a function, not an estimator' % model_type)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight,
                        degree=3).fit(train_x, train_y)
    else:
        raise NotImplementedError('Model not implemented')
    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime

    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y,
               fmt='%i')

    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST] Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))

    return accuracy_score(test_y, test_pred_y)
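
# A minimal call sketch for the function above, assuming a configured
# `logger` and an out_dir that already contains a 'preds' subdirectory
# (both are required by the function body):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
acc = run_simple_model(X_tr, y_tr, None, None, X_te, y_te,
                       model_type='LogisticRegression', out_dir='output')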
# quantile_transformer = preprocessing.QuantileTransformer(
#     output_distribution='normal', random_state=42, n_quantiles=73)
# X_trans = quantile_transformer.fit_transform(X)
# plt.hist(z)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=5)

lr = linear_model.LinearRegression()
lasso = linear_model.Lasso()
ridge = linear_model.RidgeCV()
bayard = linear_model.ARDRegression()
# bayridge = linear_model.BayesianRidge()

models = [lr, lasso, ridge, bayard]
for model in models:
    print(model)
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
    print(model.intercept_)
    print(model.coef_)

# efs = EFS(lr,
# best subset index given 1, 2, 4, 6, 9, 10, 11, 13, 14 ->
# 'eFG%', 'OppeFG%', 'ORB%', 'OppDRB%', 'DRB%', 'TOV%', 'OppTOV%', 'STL%',
# 'OppPF'; a second run-through can drop OppDRB% as it is the inverse of ORB%
# ## Bayesian Ridge Regression

Bayesreg = linear_model.BayesianRidge()
Bayesreg_model_fit = model_fit(Bayesreg, 'Bayesian_Ridge_Regression', X_train,
                               y_train, X_cv, y_cv, X_test, y_test,
                               features_name, train, cv, test)
coef = pd.DataFrame(Bayesreg.coef_, index=features_name,
                    columns=['features_importance'])
coef.sort_index(ascending=False, inplace=True)
print(coef.head(10).round(6))
coef.to_csv(para.path_results + "features_importance_Bayesreg.csv")

# ## ARD Regression

ardreg = linear_model.ARDRegression()
ardreg_model_fit = model_fit(ardreg, 'ARD_Regression', X_train, y_train,
                             X_cv, y_cv, X_test, y_test, features_name,
                             train, cv, test)
coef = pd.DataFrame(ardreg.coef_, index=features_name,
                    columns=['features_importance'])
coef.sort_index(ascending=False, inplace=True)
print(coef.head(10).round(6))
coef.to_csv(para.path_results + "features_importance_ardreg.csv")

# ## TheilSen Regression

theilsenreg = linear_model.TheilSenRegressor()
theilsenreg_model_fit = model_fit(theilsenreg, 'TheilSen_Regression',
    regression(linear_model.HuberRegressor()),
    regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
    regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
    regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.Lars()),
    regression(linear_model.LarsCV()),
    regression(linear_model.Lasso(random_state=RANDOM_SEED)),
    regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
    regression(linear_model.LassoLars()),
    regression(linear_model.LassoLarsIC()),
    regression(linear_model.OrthogonalMatchingPursuit()),
    regression(linear_model.OrthogonalMatchingPursuitCV()),
    regression(linear_model.Ridge(random_state=RANDOM_SEED)),
    regression(linear_model.RidgeCV()),
    regression(linear_model.BayesianRidge()),
    regression(linear_model.ARDRegression()),
    regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
    regression(
        linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),

    # Logistic Regression
    classification(
        linear_model.LogisticRegression(random_state=RANDOM_SEED)),
    classification(
        linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
    classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
    classification(linear_model.RidgeClassifierCV()),
    classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
    classification_binary(
        linear_model.LogisticRegression(random_state=RANDOM_SEED)),
    classification_binary(
def regress_sys(folder, all_videos, y, training_size, have_output=True):
    """
    Uses regression techniques to select the best tracking parameters.

    Regresses against the intensities of the input images.

    Parameters
    ----------
    all_videos : list
        Contains prefixes of video filenames of the entire video set to be
        tracked. The training dataset will be some subset of these videos.
    y : numpy array
        Contains manually acquired quality levels using Trackmate for the
        files contained in the training dataset.
    training_size : int
        Number of files in the training dataset.
    have_output : boolean
        If you have already acquired the quality values (y) for the training
        dataset, set to True. If False, it will output the files the user
        will need to acquire quality values for.

    Returns
    -------
    regress_object : list of sklearn regression objects
        Contains a list of regression objects assembled from the training
        datasets. Uses the mean, 10th percentile, 90th percentile, and
        standard deviation intensities to predict the quality parameter in
        Trackmate.
    """
    tprefix = []
    for i in range(0, training_size):
        random.seed(i + 1)
        # randint is inclusive at both ends, so cap at len(all_videos) - 1
        tprefix.append(all_videos[random.randint(0, len(all_videos) - 1)])
        if have_output is False:
            print("Get parameters for: {}".format(tprefix[i]))

    if have_output is True:
        # Define descriptors
        descriptors = np.zeros((training_size, 4))
        counter = 0
        for name in tprefix:
            pup = name.split('_')[0]
            local_im = name + '.tif'
            remote_im = "{}/{}/{}".format(folder, pup, local_im)
            aws.download_s3(remote_im, local_im)
            test_image = sio.imread(local_im)
            descriptors[counter, 0] = np.mean(test_image[0, :, :])
            descriptors[counter, 1] = np.std(test_image[0, :, :])
            descriptors[counter, 2] = np.percentile(test_image[0, :, :], 10)
            descriptors[counter, 3] = np.percentile(test_image[0, :, :], 90)
            counter = counter + 1

        # Define regression techniques
        X = descriptors
        classifiers = [
            svm.SVR(),
            linear_model.SGDRegressor(),
            linear_model.BayesianRidge(),
            linear_model.LassoLars(),
            linear_model.ARDRegression(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.LinearRegression()
        ]
        regress_object = []
        for item in classifiers:
            clf = item
            regress_object.append(clf.fit(X, y))
        return regress_object
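
# A sketch of applying the fitted regressors returned above to a new image's
# descriptors to predict a Trackmate quality value; the feature order matches
# the descriptor columns built in the function (mean, std, 10th and 90th
# percentile), and the intensity values here are hypothetical.
import numpy as np

new_desc = np.array([[140.2, 22.5, 110.0, 171.3]])
predictions = [reg.predict(new_desc)[0] for reg in regress_object]
print('median predicted quality:', np.median(predictions))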
from sklearn import preprocessing
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)

# note: the split below rebinds y_train, so y_train_encoded is never used
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

classifiers = {
    'SVR': svm.SVR(),
    'SVC': SVC(),
    'SGD': linear_model.SGDRegressor(),
    'BAYES': linear_model.BayesianRidge(),
    'LL': linear_model.LassoLars(),
    'ARD': linear_model.ARDRegression(),
    'PA': linear_model.PassiveAggressiveRegressor(),
    'TS': linear_model.TheilSenRegressor(),
    'L': linear_model.LinearRegression()
}

train_scores = []
test_scores = []
names = []
models = {}
for key in classifiers.keys():
    clf = classifiers[key]
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    y_test_predict = clf.predict(X_test)
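
# A sketch of how the collectors declared above might be filled and
# summarized; the DataFrame layout is an assumption, not from the original.
import pandas as pd

for key, clf in classifiers.items():
    clf.fit(X_train, y_train)
    names.append(key)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))
    models[key] = clf

results = pd.DataFrame({'model': names, 'train_R2': train_scores,
                        'test_R2': test_scores})
print(results.sort_values('test_R2', ascending=False))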
def __init__(self):
    '''
    Class constructor or initialization method.
    '''
    # keys and tokens from the Twitter Dev Console
    # (redacted; substitute your own credentials)
    consumer_key = 'XXXX'
    consumer_secret = 'XXXX'
    access_token = 'XXXX'
    access_token_secret = 'XXXX'

    # attempt authentication
    try:
        # create OAuthHandler object
        self.auth = OAuthHandler(consumer_key, consumer_secret)
        # set access token and secret
        self.auth.set_access_token(access_token, access_token_secret)
        # create tweepy API object to fetch tweets
        self.api = tweepy.API(self.auth)
    except:
        print("Error: Authentication Failed")

    # creating object of TwitterClient Class
    # api = TwitterClient()
    # calling function to get tweets
    wSent = ["WSENT"]
    aSent = ["ASENT"]
    for index in range(3, 8):
        day = datetime.date.today() - datetime.timedelta(days=index)
        wTweets = self.get_tweets(query='weather', count=100,
                                  geocode='41.2565,-96.05,5mi', until=day)
        aTweets = self.get_tweets(query='', count=100,
                                  geocode='41.2565,-96.05,5mi', until=day)
        ptweets = [tweet for tweet in wTweets
                   if tweet['sentiment'] == 'positive']
        ntweets = [tweet for tweet in wTweets
                   if tweet['sentiment'] == 'negative']
        netPosSent = (len(ptweets) / len(wTweets)) - (len(ntweets) / len(wTweets))
        wSent.append(netPosSent)
        ptweets = [tweet for tweet in aTweets
                   if tweet['sentiment'] == 'positive']
        ntweets = [tweet for tweet in aTweets
                   if tweet['sentiment'] == 'negative']
        netPosSent = (len(ptweets) / len(aTweets)) - (len(ntweets) / len(aTweets))
        aSent.append(netPosSent)
    # print(wSent)
    # print(aSent)

    url = "https://www.ncei.noaa.gov/orders/cdo/2069913.csv"
    dataset = pandas.read_csv(url)
    dataset = dataset.drop(['STATION', 'NAME', 'DATE'], axis=1)
    dataset['WSENT'] = wSent[1:]
    # dataset['ASENT'] = aSent[1:]
    dataset = dataset.dropna()
    # print(dataset.shape)

    classifiers = [
        svm.SVR(),
        linear_model.SGDRegressor(),
        linear_model.BayesianRidge(),
        linear_model.LassoLars(),
        linear_model.ARDRegression(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.TheilSenRegressor(),
        linear_model.LinearRegression()
    ]
    trainingData = dataset.drop(['WSENT'], axis=1)
    trainingScores = dataset['WSENT']
    predictionData = dataset.drop(['WSENT'], axis=1)

    global clf
    for item in classifiers:
        # print(item)
        clf = item
        clf.fit(trainingData, trainingScores)
        print(clf.predict(predictionData), '\n')