def OLSregression(y, x, n=0):
    """Fit OLS (and, if n != 0, a cross-validated elastic net) and plot fitted vs. actual."""
    x0 = sm.add_constant(x)
    m1 = sm.OLS(y, x0).fit()
    print(m1.summary())
    print(np.std(m1.predict() - np.array(y)))

    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.plot(y.index, pd.Series(m1.predict()), label='fitted', c=pick_a_color())
    plt.plot(y.index, y, label='actual', c=pick_a_color())
    plt.legend(loc='best')
    plt.title('OLS')
    plt.show()

    if n == 0:
        return m1.params
    else:
        # cross-validated elastic net over a small grid of penalty strengths
        # and L1/L2 mixing ratios
        m2 = en(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                l1_ratio=[.01, .1, .5, .9, .99],
                max_iter=5000).fit(x, y)
        print(m2.intercept_, m2.coef_)
        print(np.std(m2.predict(x) - np.array(y)))

        fig = plt.figure(figsize=(10, 5))
        ax = fig.add_subplot(111)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.plot(y.index, pd.Series(m2.predict(x)), label='fitted', c=pick_a_color())
        plt.plot(y.index, y, label='actual', c=pick_a_color())
        plt.legend(loc='best')
        plt.title('Elastic Net')
        plt.show()
        return [m2.intercept_] + [item for item in m2.coef_]
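# OLSregression assumes imports and a pick_a_color() helper that are not shown
# in this snippet; a minimal sketch of a compatible setup (the palette and the
# pick_a_color implementation here are assumptions, not from the source —
# the plural alphas= argument implies en is sklearn's ElasticNetCV):
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.linear_model import ElasticNetCV as en

_palette = cycle(['#0057e7', '#d62d20', '#008744', '#ffa700'])

def pick_a_color():
    # cycle through a fixed palette so 'fitted' and 'actual' get distinct colors
    return next(_palette)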
def get_estimator():
    """Build your estimator here."""
    # note: the normalize argument was removed in scikit-learn 1.2;
    # drop it on newer versions (False is the old default anyway)
    eln = en(alpha=1, l1_ratio=0.05, fit_intercept=True, normalize=False,
             precompute=False, max_iter=10000, copy_X=True, tol=0.0001,
             warm_start=False, positive=False, random_state=123,
             selection='random')
    estimator = make_pipeline(VBMFeatureExtractor(), StandardScaler(), eln)
    return estimator
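# Assumed context for get_estimator (not shown above): the scalar alpha=
# argument implies en is sklearn's plain ElasticNet, and VBMFeatureExtractor
# is a project-specific transformer defined elsewhere; the pipeline imports
# would look like this:
from sklearn.linear_model import ElasticNet as en
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler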
# (the header of the loop filling x is truncated in the source; it plausibly
# iterates over the same reaction keys used for z below, since predict()
# requires x and z to have the same number of features)
for r in fam_panEFM[v]['reac_met_freq_association']:
    x.append(fam_panEFM[v]['freq_reactions'][r])
sel = np.random.choice(np.arange(1000), replace=False, size=200)
x = np.array(x).T
for m in fam_panEFM[v]['y_metabolome']:
    y.append(fam_panEFM[v]['freq_metabolite_use'][m])
    mets.append(m)
y = np.array(y).T
for r in fam_panEFM[v]['reac_met_freq_association']:
    z.append(fam_panEFM[v]['freq_mod_reactions'][r])
z = np.array(z)

from sklearn.linear_model import MultiTaskElasticNetCV as en
enet = en(cv=2, n_jobs=5, verbose=1, max_iter=10000)
EN = enet.fit(x[sel], y[sel])
p = EN.predict(z.reshape(1, -1)).flatten()
d = {mets[i]: p[i] for i in range(len(mets))}
metab_d[v] = d.copy()

en_res = {}
for fam in metab_d:
    for m in metab_d[fam]:
        if m not in en_res:
            en_res[m] = np.zeros(len(families))
for i, fam in enumerate(families):
    for m in metab_d[fam]:
        ...
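# Shape sanity check for the multi-task fit above: x is (n_samples,
# n_reactions), y is (n_samples, n_metabolites), and a single query vector z
# must be reshaped to (1, n_reactions) before predict. A self-contained toy
# run (toy data and sizes are illustrative assumptions):
import numpy as np
from sklearn.linear_model import MultiTaskElasticNetCV

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(50, 8))           # 50 samples, 8 "reactions"
Y_toy = X_toy @ rng.normal(size=(8, 3))    # 3 "metabolite" targets
model = MultiTaskElasticNetCV(cv=2, max_iter=10000).fit(X_toy, Y_toy)
print(model.predict(X_toy[0].reshape(1, -1)).flatten())  # one row -> 3 values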
# let's denote data from 2017-4-25 to 2018-4-25 as the backtesting window / testing set
x0 = pd.concat([df['usd'], df['gbp'], df['eur'], df['brent']], axis=1)
x1 = sm.add_constant(x0)
x = x1[x1.index < '2017-04-25']
y = df['nok'][df.index < '2017-04-25']

model = sm.OLS(y, x).fit()
print(model.summary(), '\n')

# In[4]:

# nevertheless, from the summary you can tell there is multicollinearity:
# the condition number is skyrocketing
# alternatively, we can use elastic net regression to achieve convergence
m = en(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
       l1_ratio=[.01, .1, .5, .9, .99],
       max_iter=5000).fit(x0[x0.index < '2017-04-25'], y)
print(m.intercept_, m.coef_)

# elastic net estimation results:
# 3.79776228406 [ 0.00388958  0.01992038  0.02823187  0.00050092]

# In[5]:

# we plot the difference between the two approaches
# note that the difference is negatively skewed
df['sk_fit'] = (df['usd'] * m.coef_[0] + df['gbp'] * m.coef_[1] +
                df['eur'] * m.coef_[2] + df['brent'] * m.coef_[3] +
                m.intercept_)
df['ols_fit'] = (df['usd'] * model.params[1] + df['gbp'] * model.params[2] +
                 df['eur'] * model.params[3] + df['brent'] * model.params[4] +
                 model.params[0])
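# To quantify the "negatively skewed" claim above, one could compare the two
# fitted series directly; a minimal sketch, assuming scipy is available:
from scipy.stats import skew

diff = df['sk_fit'] - df['ols_fit']
print('skewness of (elastic net fit - OLS fit):', skew(diff.dropna()))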
# shuffle the whole dataset
# (note: recent scikit-learn rejects test_size=0; sklearn.utils.shuffle(X, y,
# random_state=0) is the modern equivalent of this trick)
X_r, X_d, y_r, y_d = train_test_split(X, y, test_size=0, random_state=0)

max_score_tra = 0
para_tra = None
max_score_cv = 0
para_cv = None

# brute-force grid over ElasticNet's boolean options
# (the normalize argument was removed in scikit-learn 1.2)
for precompute in [True, False]:
    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            for copy_X in [True, False]:
                for warm_start in [True, False]:
                    for positive in [True, False]:
                        clf = en(precompute=precompute, fit_intercept=fit_intercept,
                                 normalize=normalize, copy_X=copy_X,
                                 warm_start=warm_start, positive=positive,
                                 random_state=0)
                        clf1 = en(precompute=precompute, fit_intercept=fit_intercept,
                                  normalize=normalize, copy_X=copy_X,
                                  warm_start=warm_start, positive=positive,
                                  random_state=0)
                        clf.fit(X_train, y_train)
                        score_tra = clf.score(X_test, y_test)
                        score_cv = cross_val_score(clf1, X_r, y_r, cv=5)
                        if score_tra > max_score_tra:
                            max_score_tra = score_tra
                            # call get_params(): store the dict, not the bound method
                            para_tra = clf.get_params()
                        if score_cv.mean() > max_score_cv:
                            max_score_cv = score_cv.mean()
                            para_cv = clf1.get_params()

# y_pred = clf.predict(X_test)
# print(clf.get_params())
# print(clf.scores_)
print(max_score_tra)
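# The nested loops above re-implement an exhaustive grid search by hand; a
# sketch of the idiomatic equivalent with GridSearchCV (same boolean grid,
# 5-fold CV; normalize is omitted because newer scikit-learn removed it):
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

param_grid = {p: [True, False]
              for p in ('precompute', 'fit_intercept', 'copy_X',
                        'warm_start', 'positive')}
search = GridSearchCV(ElasticNet(random_state=0), param_grid, cv=5)
search.fit(X_r, y_r)
print(search.best_score_, search.best_params_)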
br = '\n'
X = np.load('data/X_boston.npy')
y = np.load('data/y_boston.npy')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

regressors = [
    lr(), bay(), rr(alpha=.5, random_state=0),
    l(alpha=0.1, random_state=0), ll(), knn(), ard(),
    rfr(random_state=0, n_estimators=100),
    SVR(gamma='scale', kernel='rbf'),
    rcv(fit_intercept=False), en(random_state=0),
    dtr(random_state=0), ada(random_state=0),
    gbr(random_state=0)
]

print('unscaled:', br)
for reg in regressors:
    reg.fit(X_train, y_train)
    rmse, name = get_error(reg, X_test, y_test)
    name = reg.__class__.__name__
    print(name + '(rmse):', end=' ')
    print(rmse)
print()

print('scaled:', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
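# The alias imports and the get_error helper used above are not shown in this
# snippet; a plausible reconstruction (aliases inferred from the calls, and
# get_error returns (rmse, name) to match the unpacking in the loop):
import numpy as np
from sklearn.linear_model import (LinearRegression as lr, BayesianRidge as bay,
                                  Ridge as rr, Lasso as l, LassoLars as ll,
                                  ARDRegression as ard, RidgeCV as rcv,
                                  ElasticNet as en)
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.ensemble import (RandomForestRegressor as rfr,
                              AdaBoostRegressor as ada,
                              GradientBoostingRegressor as gbr)
from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def get_error(model, Xt, yt):
    # root-mean-squared error on the held-out set, plus the model name
    rmse = np.sqrt(mean_squared_error(yt, model.predict(Xt)))
    return rmse, model.__class__.__name__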
reducers_cfg[PCA.__name__] = dict(
    reducer__n_components=[],
    # reducer__whiten=[True, False],
    reducer__svd_solver=['auto'])
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(
    reducer__n_features_to_select=[],
    reducer__step=[0.1])

#########################
####### Models ##########
#########################
# models = [br(), en(), ls(), lo(), ll()]
models = [br(), en(), ll()]
models_cfg = {}


def init(para=None):
    # print(para)
    if para == 2:
        #########################
        #### Data Preprocessor ##
        #########################
        preprocessors = [DummyTransformer]
        preprocessors_cfg = {}
        preprocessors_cfg[DummyTransformer.func.__name__] = {}
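# The reducer__ prefixes above imply these grids target a Pipeline whose
# dimensionality-reduction step is named 'reducer'; a minimal sketch of how
# one reducer/model pair would be wired up (the step names, n_components
# values, and model choice here are assumptions):
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('reducer', PCA()), ('model', BayesianRidge())])
grid = GridSearchCV(pipe,
                    {'reducer__n_components': [2, 5, 10],
                     'reducer__svd_solver': ['auto']},
                    cv=3)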
    oof_ridge_383[val_idx] = ridge_383.predict(X_train_383[val_idx])
    predictions_ridge_383 += ridge_383.predict(X_test_383) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_383, target)))

folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_383 = np.zeros(train_shape)
predictions_en_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # ElasticNet (elastic net)
    en_383 = en(alpha=1.0, l1_ratio=0.06)
    en_383.fit(tr_x, tr_y)
    oof_en_383[val_idx] = en_383.predict(X_train_383[val_idx])
    predictions_en_383 += en_383.predict(X_test_383) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_383, target)))

folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_383 = np.zeros(train_shape)
predictions_br_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
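# The ridge / elastic net / (truncated) third-model blocks above repeat the
# same out-of-fold pattern; a sketch of a helper that factors it out — oof_cv
# and its signature are assumptions, not from the source:
import numpy as np
from sklearn.model_selection import KFold

def oof_cv(make_model, X_tr, y_tr, X_te, n_splits=5, seed=13):
    # returns out-of-fold predictions on the train set and the fold-averaged
    # predictions on the test set
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(X_tr))
    preds = np.zeros(len(X_te))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_tr, y_tr)):
        model = make_model()
        model.fit(X_tr[trn_idx], y_tr[trn_idx])
        oof[val_idx] = model.predict(X_tr[val_idx])
        preds += model.predict(X_te) / folds.n_splits
    return oof, preds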