def Stepwise_Forward_Selection(Data, Inputs, Output):
    """Forward feature selection: repeatedly add the candidate with the
    smallest OLS p-value until none falls below the threshold."""
    Model_var1 = sm.OLS
    X = Data[Inputs]
    y = Data[Output]
    initial_list = []
    threshold_in = 0.05
    verbose = True
    included = list(initial_list)
    while True:
        changed = False
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = Model_var1(
                y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            # idxmin returns the column label; argmin would return a position
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        if not changed:
            break
    return included
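# A minimal usage sketch for the forward-selection helper above (hypothetical
# data and column names; assumes numpy is available as np alongside the pd/sm
# imports the function already relies on).
rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['x1', 'x2', 'x3'])
demo['y'] = 2.0 * demo['x1'] - demo['x2'] + rng.normal(size=100)
print(Stepwise_Forward_Selection(demo, ['x1', 'x2', 'x3'], 'y'))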
def test_summary_col_ordering_preserved(self):
    # gh-3767
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
    x2 = pd.DataFrame(x2, columns=['const', 'b', 'a'])
    y1 = [6, 4, 2, 7, 4]
    y2 = [8, 5, 0, 12, 4]
    reg1 = OLS(y1, x2).fit()
    reg2 = OLS(y2, x2).fit()
    info_dict = {
        'R2': lambda x: '{:.3f}'.format(int(x.rsquared)),
        'N': lambda x: '{0:d}'.format(int(x.nobs))
    }
    original = summary_col([reg1, reg2], float_format='%0.4f')
    actual = summary_col([reg1, reg2], regressor_order=['a', 'b'],
                         float_format='%0.4f', info_dict=info_dict)
    variables = ('const', 'b', 'a')
    for line in str(original).split('\n'):
        for variable in variables:
            if line.startswith(variable):
                assert line in str(actual)
def SUR_model(method):
    """Estimate the 34-equation system jointly by SUR;
    `method` ('ols' or 'gls') is passed through to fit."""
    from linearmodels.system import SUR
    from collections import OrderedDict
    import statsmodels.regression.linear_model as smlm
    Equation = OrderedDict()
    for i in range(34):
        x_lag1 = np.nan * np.ones(X_data.shape[0])
        y_lag1 = np.nan * np.ones(X_data.shape[0])
        x_lag1[1:] = X_data.iloc[:-1, i]
        y_lag1[1:] = y_data.iloc[:-1, i]
        y_reg = y_data.iloc[:, i]
        y_reg.name = 'netflow_' + str(i)
        X_exo = pd.concat([pd.Series(y_lag1), pd.Series(x_lag1)], axis=1)
        X_exo = smlm.add_constant(X_exo)
        # X_exo = X_exo.iloc[1:, :]
        X_exo.columns = ['const', 'netflow_lag1', 'panic']
        name = 'Platform_' + str(i)
        Equation[name] = {'dependent': y_reg, 'exog': X_exo}
    reg = SUR(Equation).fit(method=method)
    print(reg)
    return reg, Equation
def test_summarycol(self):
    # Test for latex output of summary_col object
    desired = r'''
\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lcc}
\hline
      & y I      & y II     \\
\midrule
\midrule
const & 7.7500   & 12.4231  \\
      & (1.1058) & (3.1872) \\
x1    & -0.7500  & -1.5769  \\
      & (0.2368) & (0.6826) \\
\hline
\end{tabular}
\end{center}
\end{table}
'''
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    y1 = [6, 4, 2, 7, 4]
    y2 = [8, 5, 0, 12, 4]
    reg1 = OLS(y1, x).fit()
    reg2 = OLS(y2, x).fit()
    actual = summary_col([reg1, reg2]).as_latex()
    actual = '\n%s\n' % actual
    assert_equal(desired, actual)
def stepwise_selection(X, y, initial_list=[], threshold_in=0.01,
                       threshold_out=0.05, verbose=True):
    """Perform a forward-backward feature selection based on p-values."""
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(
                y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax returns the column label; argmax would return a position
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
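# Usage sketch for the forward-backward selector above (hypothetical data;
# assumes numpy as np next to the pd/sm imports the function uses).
rng = np.random.default_rng(1)
X_demo = pd.DataFrame(rng.normal(size=(200, 4)), columns=['a', 'b', 'c', 'd'])
y_demo = 3.0 * X_demo['a'] + 0.5 * X_demo['c'] + rng.normal(size=200)
selected = stepwise_selection(X_demo, y_demo, threshold_in=0.01, threshold_out=0.05)
print('selected features:', selected)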
def compute_QTL_gti_peaki(datapoint):
    """Weighted least squares of peak values on genotypes;
    returns the p-value of the genotype coefficient."""
    [peak_sample, gt_sample, weight_sample] = datapoint
    # drop samples with missing genotype (coded as -1)
    valid_samples = np.where(gt_sample != -1)[0]
    y = np.array(peak_sample[valid_samples])
    y = y.astype(float)
    x = np.array(gt_sample[valid_samples])
    x_weights = np.array(weight_sample[valid_samples])
    x = sm.add_constant(x)
    wls_model = sm.WLS(y, x, weights=x_weights)
    results = wls_model.fit()
    return results.pvalues[1]  # index 0 is the constant, 1 is the genotype
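# Toy call of compute_QTL_gti_peaki (hypothetical inputs; genotypes coded
# 0/1/2 with -1 for missing, matching the masking inside the function).
peaks = np.array([1.2, 0.8, 1.9, 2.1, 0.5, 1.4])
genotypes = np.array([0, 1, 2, 2, -1, 1])
weights = np.array([1.0, 1.0, 0.5, 0.5, 1.0, 1.0])
print(compute_QTL_gti_peaki([peaks, genotypes, weights]))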
def test_OLSsummary(self):
    # Test that latex output of regular OLS output still contains
    # multiple tables
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    y1 = [6, 4, 2, 7, 4]
    reg1 = OLS(y1, x).fit()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        actual = reg1.summary().as_latex()
    string_to_find = r'''\end{tabular}
\begin{tabular}'''
    result = string_to_find in actual
    assert result is True
def test_summarycol_drop_omitted(self):
    # gh-3702
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
    y1 = [6, 4, 2, 7, 4]
    y2 = [8, 5, 0, 12, 4]
    reg1 = OLS(y1, x).fit()
    reg2 = OLS(y2, x2).fit()
    actual = summary_col([reg1, reg2], regressor_order=['const', 'x1'],
                         drop_omitted=True)
    assert 'x2' not in str(actual)
    actual = summary_col([reg1, reg2], regressor_order=['x1'],
                         drop_omitted=False)
    assert 'const' in str(actual)
    assert 'x2' in str(actual)
def test_summary_col_ordering_preserved(self):
    # gh-3767
    x = [1, 5, 7, 3, 5]
    x = add_constant(x)
    x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
    y1 = [6, 4, 2, 7, 4]
    y2 = [8, 5, 0, 12, 4]
    reg1 = OLS(y1, x2).fit()
    reg2 = OLS(y2, x2).fit()
    info_dict = {'R2': lambda x: '{:.3f}'.format(int(x.rsquared)),
                 'N': lambda x: '{0:d}'.format(int(x.nobs))}
    original = summary_col([reg1, reg2], float_format='%0.4f')
    actual = summary_col([reg1, reg2], regressor_order=['x2', 'x1'],
                         float_format='%0.4f', info_dict=info_dict)
    variables = ('const', 'x1', 'x2')
    for line in str(original).split('\n'):
        for variable in variables:
            if line.startswith(variable):
                assert line in str(actual)
def iteration(e0, depth):
    # work on a copy so the bootstrap does not overwrite the original y_data
    y_star = y_data.copy()
    Wald_iter_df = pd.DataFrame(index=range(depth), columns=y_star.columns)
    for d in range(depth):
        e = e0.sample(frac=1).reset_index(drop=True)  # reshuffle the residuals
        for i in range(34):
            ei = e.iloc[:, i]
            y_star_lag = pd.Series(index=y_star.index, dtype=float)
            y_star_lag[1:] = y_star.iloc[:-1, i]
            # generate y-star from the resampled residuals and fitted parameters
            y_star.iloc[1:, i] = ei + Params_df.loc[
                'const', i] + Params_df.loc[0, i] * y_star_lag[1:]
            # y_star.iloc[0, i] = y_data.iloc[0, i]
            x_lag1 = pd.Series(index=y_star.index, dtype=float)
            y_lag1 = pd.Series(index=y_star.index, dtype=float)
            x_lag1[1:] = X_data.iloc[:-1, i]
            y_lag1[1:] = y_star.iloc[:-1, i]
            y_reg = y_star.iloc[:, i]
            X_exo = pd.concat([y_lag1, x_lag1], axis=1)
            X_exo = lm.add_constant(X_exo)
            model = lm.OLS(y_reg, X_exo, missing='drop')
            res = model.fit()
            R = np.eye(len(res.params))[2]  # restriction on the third coefficient
            # print(res.wald_test(R).fvalue[0][0])
            # Wald_iter_df.iloc[d, i] = res.wald_test(R).fvalue[0][0]
            try:
                wald_i = res.wald_test(R).fvalue[0][0]
            except ValueError:
                Wald_iter_df.iloc[d, i] = np.nan
                print(d, i, "Wald test failed")
                # print(X_exo)
            else:
                Wald_iter_df.iloc[d, i] = wald_i
    return Wald_iter_df
def test_demo():
    Wald_test_list = []
    e0 = pd.DataFrame(index=X_data.index, columns=y_data.columns)
    for i in range(34):
        x_lag1 = np.nan * np.ones(X_data.shape[0])
        y_lag1 = np.nan * np.ones(X_data.shape[0])
        x_lag1[1:] = X_data.iloc[:-1, i]
        y_lag1[1:] = y_data.iloc[:-1, i]
        y_reg = y_data.iloc[:, i]
        X_exo = pd.concat([pd.Series(y_lag1), pd.Series(x_lag1)], axis=1)
        X_exo = lm.add_constant(X_exo)
        model = lm.OLS(y_reg, X_exo, missing='drop')
        res = model.fit()
        R = np.eye(len(res.params))[2]
        print(res.params)
        print(R)
        print(res.wald_test(R))
        Params_df.iloc[:, i] = res.params
        # F-value of the Wald test on the initial sample
        Wald_test_list.append(res.wald_test(R).fvalue[0][0])
        # residuals for series i (34 series in total)
        e0.iloc[:, i] = y_reg - res.params['const'] - res.params[0] * y_lag1
    return e0, Wald_test_list
#==============================================================================
#==============================================================================
# Linear regression diagnostics
# 1. Normality test of the residuals
# 2. Autocorrelation test of the residuals
# 3. Condition number of the independent variables
#==============================================================================
from sklearn.datasets import make_regression
from statsmodels.regression import linear_model as sm
import pandas as pd

X0, y, coef = make_regression(n_samples=100, n_features=1, noise=20,
                              coef=True, random_state=0)
# noise: standard deviation of the Gaussian noise applied to the output.
dfX0 = pd.DataFrame(X0, columns=['X1'])
# prepend a column of ones to the regressors so the model has an intercept term
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(y, columns=['y'])
model = sm.OLS(dfy, dfX)
result = model.fit()
print(result.summary())

# Ways to deal with multicollinearity (when some regressors can be explained
# by the others, the design matrix loses full rank and coefficient estimation
# becomes unreliable):
# 1. Drop dependent variables via variable selection
# 2. Extract new variables with PCA
# 3. Apply regularization (Lasso, Ridge, ElasticNet, etc.)
from statsmodels.datasets.longley import load_pandas
import seaborn as sns
import matplotlib.pyplot as plt
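# The comments above mention the condition number and multicollinearity; a
# minimal sketch of both checks on the Longley data imported above (the VIF
# import is an addition, not part of the original snippet).
dfy_l = load_pandas().endog
dfX_l = sm.add_constant(load_pandas().exog)
res_l = sm.OLS(dfy_l, dfX_l).fit()
print(res_l.condition_number)  # huge for Longley, flagging near-collinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor
for i, name in enumerate(dfX_l.columns):
    if name != 'const':
        print(name, variance_inflation_factor(dfX_l.values, i))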
# Use sds to make vector of noise.
eta = random.randn(len(s)) * sds  # In MatLab version: eta = randn(size(s)).*sds;

# Pretend some data points are extreme.
eta[7] = -1
eta[8] = -3
eta[10] = -3

# Find observed values of x (with noise added).
x = m * s + c + eta

# Weighted Least Squares regression. We could find the solution using the
# smallest value in the Farray below also.
# Find weightings w (discount) for each data point.
vars0 = sds ** 2
w = 1 / vars0  # In MatLab version: w = 1./vars0
# w = ones(size(w))  # un-comment this line to get solution based on uniform noise terms.

# prepend column of ones so that solution includes intercept term.
ss = sm.add_constant(s)  # In MatLab version: ss = [ones(size(s)) s]
# endog comes first, exog second; the original had these swapped, which is
# why it needed a "+c" fudge on the intercept.
model = sm.WLS(x, ss, weights=w)  # In MatLab version: [params,stdx,mse,S] = lscov(ss,x,w)
results = model.fit()
cest2, mest2 = results.params  # with add_constant, the intercept comes first
xest2 = mest2 * s + cest2  # In MatLab version: xest2 = mest2.*s + cest2;
c0 = cest2
m0 = mest2

# Plot fitted line xest (=xhat in text) and data points.
fig1 = plt.figure()
plt.plot(s, x, 'k*', s, xest2, 'k')
plt.xlabel('Salary, ' + r'$s$' + ' (groats)')  # use trick to get italic font
plt.ylabel('Height, ' + r'$x$' + ' (feet)')
plt.xlim((0, 12))
sds = sds * arange(1, 12) / 10.0

# Use noise values copied from book (based on sds above).
eta = [-0.0023, -0.0728, 0.1104, 0.6076, -0.3034, -0.2237,
       0.7407, -1.0, -3.0, -2.4653, -3.0]

# Find observed values of x (with noise added).
x = m * s + c + eta

# Weighted Least Squares regression.
# Find weightings w (discount) for each data point.
vars0 = sds ** 2
w = 1 / vars0
# Un-comment next line for solution based on un-weighted regression.
# w = ones(size(w))

ss = sm.add_constant(s)  # Add column of 1s for regression.
model = sm.WLS(x, ss, weights=w)
results = model.fit()
cest2, mest2 = results.params
print('Estimated slope = %.3f,' % mest2)
print(' estimated intercept = %.3f.' % cest2)

# Make line xest2 based on fitted slope and intercept.
s2 = arange(0, 13)
xest2 = mest2 * s2 + cest2

# Plot fitted line xest, data points, and error bars.
fig1 = plt.figure()
plt.errorbar(s, x, yerr=sds, fmt='o', color='k')
plt.plot(s, x, 'k*', s2, xest2, 'k--')
plt.xlabel('Salary, $s$ (groats)')
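# For comparison with the weighting discussion above: the un-weighted fit that
# the commented-out w = ones(...) line would produce, done directly with OLS
# (a sketch using the ss and x already defined in this snippet).
results_ols = sm.OLS(x, ss).fit()
c_ols, m_ols = results_ols.params
print('Unweighted slope = %.3f, intercept = %.3f.' % (m_ols, c_ols))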
WeekDayNo_dummies = pd.get_dummies(df['WeekDayNo']).rename(
    columns=lambda x: 'WeekDayNo_' + str(x))
df = pd.concat([df, WeekDayNo_dummies], axis=1)
df = df.drop('WeekDayNo', axis=1)

Event_dummies = pd.get_dummies(df['Event']).rename(
    columns=lambda x: 'Event_' + str(x))
df = pd.concat([df, Event_dummies], axis=1)
df = df.drop('Event', axis=1)

df_predictor = df[['Holiday', 'WeekDayNo_1', 'WeekDayNo_2', 'WeekDayNo_3',
                   'WeekDayNo_4', 'WeekDayNo_5', 'WeekDayNo_6', 'WeekDayNo_7']]
y_target = df['DailyLongTerm_Vessey']
df_predictor = lm.add_constant(df_predictor)

df_fit, df_eval, y_fit, y_eval = train_test_split(
    df_predictor, y_target, test_size=.2, random_state=1)

ols_model = lm.OLS(y_fit, df_fit).fit()
prediction = ols_model.predict(df_eval)
print(ols_model.summary())

prediction = pd.DataFrame(prediction)
prediction.columns = ['predicted_values']
# reset both indices so the concat below lines up row by row
prediction = prediction.reset_index(drop=True)
y_eval = y_eval.reset_index(drop=True)
y_eval.name = 'DailyLongTerm_Vessey'  # y_eval is a Series, so set its name
RMSE = mean_squared_error(y_eval, prediction) ** 0.5
result_compare = pd.concat([prediction, y_eval], axis=1)
print(result_compare, RMSE)
def model(model_name, train_x, train_y, test_x, alpha=0.1):
    """Dispatch on model_name and return (test predictions, summary)."""
    summary = None
    from sklearn.neural_network import MLPRegressor
    from sklearn.svm import SVR
    import sklearn
    import statsmodels.regression.linear_model as sm
    if model_name == 'Random':
        test_y = pd.Series(np.random.random_sample((len(test_x),)),
                           index=test_x.index)
    if model_name == 'None':
        test_y = test_x.iloc[:, 0]
    if model_name == 'MLPRegressor':
        mlp = MLPRegressor(hidden_layer_sizes=(20, 20))
        mlp.fit(train_x, train_y)
        y_pred = mlp.predict(test_x)
        test_y = pd.Series(y_pred, index=test_x.index)
    if model_name == 'Lasso':
        model = sklearn.linear_model.Lasso(0.001, fit_intercept=False)
        lasso = model.fit(train_x, train_y)
        test_y = pd.Series(lasso.predict(test_x), index=test_x.index)
        summary = lasso.score(train_x, train_y)
    if model_name == 'Ridge':
        model = sklearn.linear_model.Ridge(1.0, fit_intercept=False)
        ridge = model.fit(train_x, train_y)
        test_y = pd.Series(ridge.predict(test_x), index=test_x.index)
        summary = ridge.score(train_x, train_y)
    if model_name == 'SVR':
        svr_rbf = SVR(kernel='rbf', C=1, gamma=0.0001, epsilon=0.1)
        svr_rbf.fit(train_x, train_y)
        y_pred_rbf = svr_rbf.predict(test_x)
        test_y = pd.Series(y_pred_rbf, index=test_x.index)
    if model_name == 'StepWise':
        # forward selection on OLS p-values
        feature_col = list(train_x.columns.values)
        length = len(feature_col)
        final_feature = []
        for i in range(length):
            pvalue_min = 1
            column_min = ""
            for feature in feature_col:
                temp_feature = final_feature + [feature]
                x = sm.add_constant(train_x.loc[:, temp_feature])
                model = sm.OLS(train_y, x)
                # p-value of the candidate feature (label-based, rather than
                # relying on its position in the regressor list)
                pvalue = model.fit().pvalues[feature]
                if pvalue < pvalue_min and pvalue < alpha:
                    pvalue_min = pvalue
                    column_min = feature
            if column_min != "":
                feature_col.remove(column_min)
                final_feature.append(column_min)
            else:
                break
        X = sm.add_constant(train_x.loc[:, final_feature])
        model = sm.OLS(train_y, X)
        res = model.fit()
        summary = pd.Series(res.pvalues, index=['const'] + final_feature)
        if not np.isnan(res.f_pvalue):
            summary['f_test'] = res.f_pvalue
        if not np.isnan(res.rsquared_adj):
            summary['score'] = res.rsquared_adj
        xx = sm.add_constant(test_x.loc[:, final_feature], has_constant='raise')
        test_y = res.predict(xx)
    if model_name == 'AdaBoost':
        from sklearn.ensemble import AdaBoostRegressor
        model = AdaBoostRegressor(n_estimators=100, learning_rate=0.5)
        adaboost = model.fit(train_x, train_y)
        test_y = pd.Series(adaboost.predict(test_x), index=test_x.index)
    if model_name == 'RandomForestRegressor':
        from sklearn.ensemble import RandomForestRegressor
        rfr = RandomForestRegressor(n_estimators=100, criterion='mse',
                                    max_features='auto')
        rfr.fit(train_x, train_y)
        y_pred_rfr = rfr.predict(test_x)
        test_y = pd.Series(y_pred_rfr, index=test_x.index)
    return test_y, summary
def __model(model_name, train_x, train_y, test_x, alpha=0.1, *args, **kwargs):
    summary = None
    from sklearn.neural_network import MLPRegressor
    from sklearn.svm import SVR
    import sklearn
    import statsmodels.regression.linear_model as sm
    from sklearn.model_selection import TimeSeriesSplit
    cv = TimeSeriesSplit(n_splits=3)
    if model_name == 'Random':
        test_y = pd.Series(np.random.random_sample((len(test_x),)),
                           index=test_x.index)
    if model_name == 'None':
        test_y = test_x.iloc[:, 0]
    if model_name == 'MLPRegressor':
        mlp = MLPRegressor(hidden_layer_sizes=(20, 20))
        mlp.fit(train_x, train_y)
        y_pred = mlp.predict(test_x)
        test_y = pd.Series(y_pred, index=test_x.index)
    if model_name == 'Lasso':
        model = sklearn.linear_model.Lasso(0.001, fit_intercept=False)
        lasso = model.fit(train_x, train_y)
        test_y = pd.Series(lasso.predict(test_x), index=test_x.index)
        summary = lasso.score(train_x, train_y)
        # print(test_y.head())
        # model = sklearn.linear_model.Lasso()
        # param_grid = {'alpha': [1e-5, 0.5*1e-4, 1e-4, 1e-3, 1e-2, 1e-1]}
        # opt = sklearn.model_selection.GridSearchCV(model, param_grid, cv=cv)
        # opt = opt.fit(train_x, train_y)
        # test_y = pd.Series(opt.predict(test_x), index=test_x.index)
        # summary = opt.score(train_x, train_y)
        # print(opt.best_params_, summary)
    if model_name == 'Ridge':
        model = sklearn.linear_model.Ridge(1.0, fit_intercept=False)
        ridge = model.fit(train_x, train_y)
        test_y = pd.Series(ridge.predict(test_x), index=test_x.index)
        summary = ridge.score(train_x, train_y)
    if model_name == 'SVR':
        # param_grid = {'gamma': list(1.0/k*np.array([1e-4, 1e-3, 1e-2])),
        #               'C': [0.01, 0.05, 0.25, 1.25]}
        # param_grid = {'C': [0.002, 0.01, 0.05, 0.25, 1.25]}
        # opt = sklearn.model_selection.GridSearchCV(svr_rbf, param_grid, cv=cv)
        # opt = opt.fit(train_x, train_y)
        # y_pred_rbf = opt.predict(test_x)
        # summary = opt.score(train_x, train_y)
        # print(opt.best_params_, summary)
        k = len(train_x.columns)
        svr_rbf = SVR(kernel='rbf', C=0.05, gamma=1.0 / k * 1e-4,
                      epsilon=0.005, max_iter=5000)
        svr_rbf = svr_rbf.fit(train_x, train_y)
        y_pred_rbf = svr_rbf.predict(test_x)
        test_y = pd.Series(y_pred_rbf, index=test_x.index)
        summary = svr_rbf.score(train_x, train_y)
        # print(test_y.head())
    if model_name == 'StepWise':
        feature_col = list(train_x.columns.values)
        length = len(feature_col)
        final_feature = []
        for i in range(length):
            pvalue_min = 1
            column_min = ""
            for feature in feature_col:
                temp_feature = final_feature + [feature]
                x = sm.add_constant(train_x.loc[:, temp_feature])
                model = sm.OLS(train_y, x)
                # p-value of the candidate feature, looked up by label
                pvalue = model.fit().pvalues[feature]
                if pvalue < pvalue_min and pvalue < alpha:
                    pvalue_min = pvalue
                    column_min = feature
            if column_min != "":
                feature_col.remove(column_min)
                final_feature.append(column_min)
            else:
                break
        X = sm.add_constant(train_x.loc[:, final_feature])
        model = sm.OLS(train_y, X)
        res = model.fit()
        summary = pd.Series(res.pvalues, index=['const'] + final_feature)
        if not np.isnan(res.f_pvalue):
            summary['f_test'] = res.f_pvalue
        if not np.isnan(res.rsquared_adj):
            summary['score'] = res.rsquared_adj
        xx = sm.add_constant(test_x.loc[:, final_feature], has_constant='raise')
        test_y = res.predict(xx)
    if model_name == 'AdaBoost':
        from sklearn.ensemble import AdaBoostRegressor
        model = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
        adaboost = model.fit(train_x, train_y)
        test_y = pd.Series(adaboost.predict(test_x), index=test_x.index)
        summary = adaboost.score(train_x, train_y)
    if model_name == 'RandomForestRegressor':
        from sklearn.ensemble import RandomForestRegressor
        rfr = RandomForestRegressor(n_estimators=100, criterion='mse',
                                    max_features='auto')
        rfr.fit(train_x, train_y)
        y_pred_rfr = rfr.predict(test_x)
        test_y = pd.Series(y_pred_rfr, index=test_x.index)
    return test_y, summary
# -*- coding: utf-8 -*-
"""
Name     : c7_03_random_OLS.py
Book     : Python for Finance (2nd ed.)
Publisher: Packt Publishing Ltd.
Author   : Yuxing Yan
Date     : 6/6/2017
email    : [email protected]
           [email protected]
"""
import numpy as np
import scipy as sp
import statsmodels.regression.linear_model as sm

n = 100
sp.random.seed(12345)
y = [1, 2, 3, 4, 2, 3, 4]
x1 = range(1, 8)
x2 = [4, 2, -1, 4, 2, 3, 5]
x3 = [0, 2, 3, 4, 2, 4, -1]
x = np.column_stack([x1, x2, x3])  # stack regressors as columns, one row per observation
x = sm.add_constant(x)
# est = sm.OLS(formula='Sales ~ TV + Radio', data=df_adv).fit()
results = sm.OLS(y, x).fit()  # regress y on the constant and x1, x2, x3
print(results.summary())
#%%
# FIRST DATASET REGRESSIONS
# Regression by Asset Class on all Indicators
start_date, end_date = "2000 01 01", "2008 01 01"
Y_assets = data_returns(asset_classes, start_date, end_date, "M", 1)
X_macro = data_lagged(
    macro_data[["Monetary Policy", "International Trade", "Risk Sentiment",
                "Growth", "Inflation"]],
    start_date, end_date, "M", 1)

# regression on each asset class
for asset in Y_assets.columns:
    res = sm.OLS(Y_assets[asset], sm.add_constant(X_macro)).fit()
    print(res.summary())

#%%
start_date, end_date = "1990 01 01", "2008 01 01"
Y_assets = data_returns(asset_classes, start_date, end_date, "M", 1)
X_macro = data_lagged(
    macro_data[["Monetary Policy", "International Trade", "Risk Sentiment",
                "Growth", "Inflation"]],
    start_date, end_date, "M", 1)

# Regression Per Indicator
for asset in Y_assets.columns:
    res = sm.OLS(Y_assets[asset], sm.add_constant(X_macro)).fit()
    print(res.summary())
xx = np.linspace(-10, 10, 1000)
plt.plot(xx, (1 / (1 + np.exp(-xx))) * 2 - 1, label='logistic (scaled)')
plt.plot(xx, np.tanh(xx), label='tanh')
plt.legend(loc=2)
plt.show()

# sklearn logistic regression
from sklearn.datasets import make_classification
import statsmodels.regression.linear_model as sm

X0, y = make_classification(n_features=1, n_redundant=0, n_informative=1,
                            n_clusters_per_class=1, random_state=4)
X = sm.add_constant(X0)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X0, y)

import matplotlib as mpl
xx = np.linspace(-3, 3, 100)
sigm = 1.0 / (1 + np.exp(-model.coef_[0][0] * xx - model.intercept_[0]))
plt.plot(xx, sigm)
# plot the (X0, y) points: marker 'o', colored by y, size 100
plt.scatter(X0, y, marker='o', c=y, s=100)
plt.scatter(X0, model.predict(X0), marker='x', c=y, s=200)
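# The unused X = add_constant(X0) above hints that a statsmodels fit was also
# intended; a minimal sketch of the equivalent Logit model (the
# statsmodels.api import is an assumption, not part of the original snippet).
import statsmodels.api as smapi
logit_res = smapi.Logit(y, X).fit()
print(logit_res.params)  # intercept and slope, comparable to model.intercept_ / model.coef_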