def multi_collinearity_test(data, flag=0):
    """Measure multicollinearity among the columns of ``data``.

    Args:
        data: 2-D array or ``pandas.DataFrame`` with one variable per column
            (the correlation matrix is computed with ``rowvar=0``).
        flag: Selects the diagnostic.

            * ``0`` -- determinant of the correlation matrix. The larger the
              value, the less likely multicollinearity is:
              0 = perfect collinearity, 1 = no collinearity.
            * ``1`` -- result of ``np.linalg.eig`` on the correlation matrix
              (eigenvalues and eigenvectors).
            * ``2`` -- variance inflation factor of every column. It
              quantifies the severity of multicollinearity in an ordinary
              least squares regression analysis. Rule of thumb:
              1 = not correlated, between 1 and 5 = moderately correlated,
              greater than 5 = highly correlated.

    Returns:
        A float for ``flag=0``, the ``np.linalg.eig`` result for ``flag=1``,
        or a dict for ``flag=2`` keyed by column name (DataFrame input) or
        by column position (array input).

    Raises:
        ValueError: If ``flag`` is not 0, 1 or 2.
    """
    if flag in (0, 1):
        # The correlation matrix is only needed by these two diagnostics.
        corr = np.corrcoef(data, rowvar=0)
        if flag == 0:
            return np.linalg.det(corr)
        return np.linalg.eig(corr)

    if flag == 2:
        from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

        if isinstance(data, pd.DataFrame):
            values = data.values
            return {c: vif(values, i) for i, c in enumerate(data.columns)}
        return {i: vif(data, i) for i in range(data.shape[1])}

    # Reject unknown flags before touching ``data``.
    raise ValueError('flag {} is not defined.'.format(flag))
def get_single_vif(group, RHS):
    """Return the VIF of every design-matrix column built from ``RHS``.

    Args:
        group: Data passed to ``patsy.dmatrix`` as ``data``.
        RHS: Formula passed to ``patsy.dmatrix`` as ``formula_like``.

    Returns:
        pandas.Series: VIF values indexed by design-matrix column name.
    """
    design = patsy.dmatrix(formula_like=RHS, data=group)
    scores = {}
    for column_name, position in design.design_info.column_name_indexes.items():
        scores[column_name] = vif(design, position)
    return pd.Series(scores)
def compute_vif(self):
    """Compute variance inflation factors for all input variables.

    Returns:
        dict: Maps each column name of ``self.X_train`` to its VIF rounded
        to two decimals.
    """
    # Convert once instead of rebuilding the array for every column;
    # np.asarray replaces np.matrix, which NumPy no longer recommends.
    design = np.asarray(self.X_train)
    return {
        col: vif(design, ind).round(2)
        for ind, col in enumerate(self.X_train.columns)
    }
def vif(self):
    """Tabulate the Variance Inflation Factor (VIF) of every predictor.

    Returns:
        pandas.DataFrame: Column "VIF Factor" holds one VIF per column of
        ``self.predictors``; column "features" holds
        ``self.predictors.columns``.
    """
    design = np.array(self.predictors)
    scores = []
    for position in range(design.shape[1]):
        # ``vif`` here is the module-level function, not this method.
        scores.append(vif(design, position))

    table = pd.DataFrame()
    table["VIF Factor"] = scores
    table["features"] = self.predictors.columns
    return table
def vif_test(X):
    """Compute the VIF of every variable in ``X``.

    Args:
        X: Object exposing ``names`` (iterable of variable names) and
            ``array`` (the matrix passed to ``vif``).

    Returns:
        dict: Maps each name to its VIF rounded to four decimals, or to
        ``0.0`` when the VIF could not be computed for that variable.
    """
    vd_out = {}
    for i, n in enumerate(X.names):
        try:
            vd_out[n] = round(vif(X.array, i), 4)
        except Exception:
            # Best effort: keep going with a placeholder, but let
            # KeyboardInterrupt / SystemExit propagate.
            vd_out[n] = 0.0
    return vd_out
def vif(self):
    '''Print the variance inflation factor of each feature variable.

    Reads ``self.features_`` and prints one line per column; returns
    nothing.
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

    # Convert once; np.asarray replaces np.matrix, which NumPy no longer
    # recommends. The OLS fit the original performed here was never used.
    features = np.asarray(self.features_)
    for i in range(features.shape[1]):
        v = vif(features, i)
        print("Variance inflation factor for feature {}: {}".format(
            i, round(v, 2)))
def vif_coeffs(M):
    '''
    Description:
        Compute the VIF of every column of the matrix M.

    Input arguments:
        * M: 2D np.ndarray

    Return:
        * A list with one VIF per column, in column order.
    '''
    column_count = M.shape[1]
    factors = []
    for column in range(column_count):
        factors.append(vif(M, column))
    return factors
def _collinear_vif(self):
    """Check for collinear features.

    Computes the VIF of every column of ``self.X``. Each column whose VIF
    exceeds ``self.vifMagnitude`` is reported on stdout (labelled with
    ``self.header``) and ``self.collinear`` is set to True.

    Raises:
        Exception: If ``self.collinear`` is truthy after the scan. The flag
            is never reset here, so a value set earlier also triggers it.
    """
    for ind in range(self.X.shape[1]):
        value = vif(self.X, ind)
        if value > self.vifMagnitude:
            # Single-argument call form prints identically on Python 2 and 3.
            print(self.header[ind] + ' has vif ' + str(value))
            self.collinear = True
    if self.collinear:
        raise Exception('Collinear feature risk')
def get_vif():
    '''
    Calculates Variance Inflation Factor for each feature in a dataframe.

    The data and the outcome column(s) come from ``read_data()``; a constant
    column is added to the predictors before the VIFs are computed.

    :return: Pandas dataframe of VIF scores for each feature
             (columns ``column`` and ``vif``).
    '''
    df, categorical_mappings, config = read_data()
    outcome = config['outcome_feature']
    y = df[outcome]
    # Compare whole column names. With a string outcome, the original
    # ``col not in outcome`` was a substring test and dropped predictors
    # such as "age" when the outcome was "percentage".
    if isinstance(y, pd.Series):
        outcome_columns = [outcome]
    else:
        outcome_columns = list(y.columns)
    X = df[[col for col in df.columns if col not in outcome_columns]]
    X = sm.add_constant(X)
    # The original also fitted an OLS model and discarded its summary.
    vif_scores = [vif(X.values, i) for i in range(X.shape[1])]
    return pd.concat([pd.Series(X.columns),
                      pd.Series(vif_scores)],
                     axis=1).rename(columns={0: 'column', 1: 'vif'})
def cal_vif(df, vif_columns):
    """Compute the VIF of the selected columns.

    Args:
        df: Source ``pandas.DataFrame``.
        vif_columns: Column labels to include.

    Returns:
        pandas.DataFrame: Two columns, ``variable`` and ``vif``.

    Missing values are replaced with -999 before the VIFs are computed.
    """
    vif_df = df.loc[:, vif_columns].fillna(-999)
    columns = vif_df.columns.tolist()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray.
    vif_ma = vif_df.values
    result = {name: vif(vif_ma, k) for k, name in enumerate(columns)}
    vif_result = pd.Series(result, name='vif')
    vif_result.index.name = 'variable'
    return vif_result.reset_index()
def calculate(self, X):
    """Drop the highest-VIF column until no VIF exceeds ``self.thresh``.

    Args:
        X: ``pandas.DataFrame`` of candidate features. The caller's frame is
            not modified; ``drop`` returns a new frame each round.

    Returns:
        pandas.Index: Labels of the columns that remain.
    """
    while True:
        columns = X.columns
        if len(columns) == 0:
            # Nothing left to score; ndarray.max() would raise ValueError
            # on an empty array.
            return columns
        values = X.values
        scores = np.array([vif(values, i) for i in range(values.shape[1])])
        if scores.max() > self.thresh:
            X = X.drop(columns[scores.argmax()], axis=1)
        else:
            return columns
def vif(self):
    """Print the variance inflation factor of each feature variable.

    Prints "Model not fitted yet!" and returns None when
    ``self.is_fitted`` is falsy; otherwise prints one line per column of
    ``self.features_``.
    """
    if not self.is_fitted:
        print("Model not fitted yet!")
        return None

    from statsmodels.stats.outliers_influence import (
        variance_inflation_factor as vif,
    )

    # Convert once; np.asarray replaces np.matrix, which NumPy no longer
    # recommends. The OLS fit the original performed here was never used.
    features = np.asarray(self.features_)
    for i in range(features.shape[1]):
        v = vif(features, i)
        print("Variance inflation factor for feature {}: {}".format(
            i, round(v, 2)))
def get_vif(X):
    """
    Takes a pd.DataFrame or 2D np.array and prints Variance Inflation Factor
    for every variable.

    An intercept column of ones is appended to a private copy before the
    VIFs are computed, so the caller's object is left untouched. The
    intercept column itself is not reported.
    """
    # The original tested an undefined name ``data`` here (NameError).
    if isinstance(X, pd.DataFrame):
        X = X.copy()
    else:
        X = pd.DataFrame(X)
    X['__INTERCEPT'] = np.ones(X.shape[0])
    values = X.values
    for i in range(X.shape[1] - 1):
        the_vif = vif(values, i)
        print("VIF for column {:03}: {:.02f}".format(i, the_vif))
def _collinear_vif(df, thresh=5.):
    """Drop collinear features from ``df``.

    Walks the columns left to right. A column whose VIF, measured against
    the columns not yet dropped, exceeds ``thresh`` is removed before the
    next column is examined.

    Args:
        df: ``pandas.DataFrame`` of features.
        thresh: VIF above which a column is dropped.

    Returns:
        pandas.DataFrame: ``df`` restricted to the columns that were kept.
    """
    x = df.values
    dropped = set()
    for i in range(x.shape[1]):
        # Position of original column i inside the shrinking array x.
        ind = i - len(dropped)
        value = vif(x, ind)
        # %-formatting gives the same space-separated output as the
        # Python 2 print statement and also runs on Python 3.
        print('%s %s %s' % (ind, value, x.shape))
        if value > thresh:
            dropped.add(df.columns[i])
            x = np.delete(x, ind, 1)
    return df[[i for i in df if i not in dropped]]
def compute(self, data, columns):
    """Report the variance inflation factor of every column of ``data``.

    The (column, VIF) pairs are printed in ascending VIF order.

    Args:
        data: your dataset
        columns: not in use; exists for structural consistency

    Returns:
        ``data`` unchanged, for structural consistency.
    """
    print("computing variance_inflation_factor")
    scores = {}
    for position, label in enumerate(data.columns):
        scores[label] = vif(data.values, position)
    ranked = sorted(scores.items(), key=lambda pair: pair[1])
    print(ranked)
    return data
def check_multicollinearity(df_exogs, add_constant=False):
    '''
    Evaluate Variance Inflation Factors and the condition number.

    When ``add_constant`` is true, a ``const`` column of ones is added to a
    copy of ``df_exogs`` first. The condition number of the matrix is
    printed as "Condition index".

    Returns a DataFrame with columns ``var`` and ``VIF``.
    '''
    df = df_exogs
    if add_constant:
        df = df_exogs.copy()
        df['const'] = 1

    scores = []
    for position in range(df.shape[1]):
        scores.append(vif(df.values, position))
    vif_df = pd.DataFrame({'var': df.columns, 'VIF': scores})

    ci = np.linalg.cond(df.values)
    print(f'Condition index: {round(ci, 2)}')
    return vif_df
def colineary_analysis(dim_vars, correlation_coef, path, new_indexes):
    """Select non-collinear columns with a greedy VIF test.

    For every ``cf`` in ``correlation_coef`` a base matrix is assembled
    from a row of ones and two slices of ``dim_vars.T``. Candidate rows
    ``dim_vars.T[inner, i]`` are then appended one at a time; a candidate
    is kept only when its VIF against the rows accepted so far does not
    exceed ``1 / (1 - cf)``. The sorted accepted indices are printed and
    saved with ``np.savetxt``.

    Args:
        dim_vars: Array whose transpose is sliced as described above.
        correlation_coef: Iterable of correlation thresholds.
            NOTE(review): a value of exactly 1 divides by zero below.
        path: Prefix of the output file names.
        new_indexes: Object providing ``get_local_inner_indices()``.
    """
    # ck=np.vstack((np.ones(dim_vars.T.shape[0]),dim_vars.T.transpose())).transpose()
    # vif_results = [vif(ck, i) for i in range(ck.shape[1])]
    for cf in correlation_coef:
        independent = np.vstack((np.ones(
            dim_vars.T[new_indexes.get_local_inner_indices()].shape[0]),
                                 dim_vars.T[:, 0]))
        independent = np.vstack(
            (independent, dim_vars.T[new_indexes.get_local_inner_indices(),
                                     -1]))
        ind_ind = np.array([0, dim_vars.T.shape[1]])
        independent_t = independent.transpose()
        for i in range(1, dim_vars.T.shape[1] - 2):
            # Trial matrix: accepted columns plus candidate i as last column.
            prueba = np.vstack(
                (independent_t.transpose(),
                 dim_vars.T[new_indexes.get_local_inner_indices(),
                            i])).transpose()
            vif_r = vif(prueba, prueba.shape[1] - 1)
            if vif_r > 1. / (1. - cf):
                continue
            else:
                independent_t = prueba
                ind_ind = np.append(ind_ind, i)
        ind_ind.sort()
        # Single-argument call form prints identically on Python 2 and 3.
        print('----------------------------------------')
        print('Analysis of colinearity')
        print('For a correlation coefficient %f' % (cf))
        print('the non-colinear snapshots are')
        print(ind_ind)
        print('----------------------------------------')
        np.savetxt(
            path + 'independent_snapschots_correlation_0_' + str(cf) + '.dat',
            ind_ind)
#基本的に説明変数が増えれば増えるほど、重回帰式の精度は高くなると紹介しましたが、それだけがいいことばかりとは限りません。 #当てはめの精度は高いのに、予測精度が低くなることを過学習(オーバーフィッティング)と言います。 #過学習になる原因は、『手持ちデータ』に過剰に適合しすぎたモデルを構築してしまったことです。 #こうなると、いまある『検証用データには当てはまりが良い』が『予測したい新しいデータに回帰式を当てはめると、 #当てはまりが悪くなる』といった減少が起きてしまいます。 #過学習を回避するためには一般的に次に紹介する『クロスバリデーション法』を用います。 #『回帰式を求める分析用のデータ』と、『その当てはまりの良さを確認するためのデータ』の2パターンを用意します。 #今回、重回帰分析用に使用したデータセットには、 #回帰式を求める『train.csv』と当てはまりの良さを確認する『test.csv』の2つが用意されているので、test.csvを使います。 ## 多重共線性(マルチコ) #多重共線性とは、説明変数間で非常に強い相関があることを指し、この値が大きいと回帰係数の分散が大きくなり、モデルの予測結果が悪くなることが知られています。 #ただし、重回帰分析を行う目的が『因果関係の洞察』ではなく、『予測』であれば、気にしなくて大丈夫です。 ##summary()の結果でいう、Cond. No.が多重共線性をチェックする指標になります。 #ただし、重回帰分析を行う目的が『因果関係の洞察』ではなく、『予測』であれば、気にしなくて大丈夫です。 # #summary()の結果でいう、Cond. No.が多重共線性をチェックする指標になります。 from statsmodels.stats.outliers_influence import variance_inflation_factor as vif num_cols = model.exog.shape[1] # 説明変数の列数 vifs = [vif(model.exog, i) for i in range(0, num_cols)] pd.DataFrame(vifs, index=model.exog_names, columns=['VIF']) # #一般的にVIFの値が10(公式のリファレンスでは、5)を超えると、依存関係が強いため、適切な重回帰分析ができないと言われています。 # #今回でいうと、ダミー変数化で作成した『week』の列のVIF値がすべて『inf』となっており、依存関係が非常に強いです。 # #繰り返しになりますが、重回帰分析の目的が『因果関係の洞察』であれば説明変数から除外したほうが無難であり、『予測』が目的であれば除外しなくても大丈夫です。
import numpy as np

# R-squared and root-mean-squared error of Y_pred against data['sales'].
r2score = r2_score(data['sales'], Y_pred)
print(r2score)
rmse = np.sqrt(mean_squared_error(data['sales'], Y_pred))
print(rmse)

# In[52]:

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Every column except the last one.
# NOTE(review): presumably the last column is the target - confirm.
ind_df = data.iloc[:, :-1]
vif_df = pd.DataFrame()
vif_df["features"] = ind_df.columns
vif_df["VIF Factor"] = [vif(ind_df.values, i) for i in range(ind_df.shape[1])]
# Notebook-style display: as a plain script the rounded copy is discarded.
vif_df.round(2)

# In[15]:

# Note: ``sm`` is bound to the formula API here, so ``sm.ols`` below is the
# formula interface.
import statsmodels.formula.api as sm

# create a fitted model with two features
lm_model = sm.ols(formula='sales ~ TV + radio ', data=data).fit()

# print the coefficients
print(lm_model.params)
print(lm_model.summary())

# In[55]:
plt.show() # Based on the Cook's Distance plot, **there are few data points with residuals possibly being outliers.** # **Variance Inflation Factor (VIF)** # # The VIF of eacb predictor allows us to check which factors to a degree cause multicollinearity in our model by dividing the ratio of variance in our multi-linear model by the variance of a simple-linear model. # In[31]: from statsmodels.stats.outliers_influence import variance_inflation_factor as vif # In[32]: for i in range(len(best_feats.columns)): v = vif(np.matrix(best_feats), i) print('Variance Inflation Factor for {}: {}'.format( best_feats.columns[i], round(v, 2))) # It seems that two factors in our model, **length of stay** and **available facilities & services**, have VIFs > 10. This means **there is multicollinearity in our model.** # # Prediction & their Intervals # # To test our model on patients with IDs 1-5, we shall create a new dataframe with just those rows, and get predictions from our model # In[33]: # Create test data from patient ids 1-5 with best features test_data = pd.DataFrame(raw_data[:5], columns=[ 'length of stay', 'routine culturing',
# 5. Assess
# Compared actual price vs predicted price to check accuracy using R-squared
#ols_model.summary()

# Implementation of variance inflation factor for multicollinearity removal
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Convert dataframe to ndarray
x_array = X.values

# Implementation of multi-collinearity removal: repeatedly drop the column
# with the largest VIF while that VIF exceeds 10.
for i in range(len(independent)):
    mvif = [
        vif(X[independent].values, index)
        for index in range(len(independent))
    ]
    max_vif = max(mvif)
    dindex = mvif.index(max_vif)
    #print("Index", dindex, "MaxVIF", max_vif, "Column", independent[dindex])
    if max_vif > 10:
        independent = independent.delete(dindex)
        #print(independent)
    else:
        # Nothing exceeds the threshold, so further passes would only
        # recompute the same VIFs.
        break

# Refit OLS on the remaining columns and predict on the training data.
Y = data["price"]
X_new = data[independent]
ols_model_1 = sm.OLS(Y, X_new).fit()
ols_model_1.summary()
predict_price_1 = ols_model_1.predict(X_new)
model = model.fit(X , Y )
print("The slope(m) of equation is", model.coef_)
print("The intercept/residue (c) is", model.intercept_)

Ypred = model.predict(X)

from sklearn.metrics import r2_score, mean_absolute_error , mean_squared_error
# Notebook-style display: as a plain script the score is discarded.
r2_score(Y, Ypred)

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Repeatedly drop the column with the largest VIF while that VIF exceeds 10.
for i in range(len(independent)):
    vif_list = [vif(data[independent].values, index) for index in range(len(independent))]
    mvif = max(vif_list)
    print("Max VIF value:",mvif)
    drop_index = vif_list.index(mvif)
    if mvif > 10:
        print("deleting", independent[drop_index])
        independent = independent.delete(drop_index)
    else:
        # Nothing exceeds the threshold, so further passes would only
        # recompute and reprint the same VIFs.
        break
print("Final Independent Variables", independent)

# Refit with statsmodels OLS on the remaining columns.
import statsmodels.api as sm
Y = data["price"]
X = data[independent]
model = sm.OLS(Y,X)
model = model.fit()
model.summary()
'RMSE by Linear Regression: ',
numerical.sqrt(
    metrics.mean_squared_error(testDataY, predictionByLinearRegression)))
# NOTE(review): the lines above are the tail of a call that starts before
# this excerpt.

# Add a constant column to the training predictors and fit OLS on them.
# NOTE(review): ``statistic`` is presumably an alias for statsmodels.api -
# confirm against the imports.
variableForStatisticX = statistic.add_constant(trainDataX)
print(variableForStatisticX.head())
variableForEST = statistic.OLS(trainDataY, variableForStatisticX)
variableForESTVisualization = variableForEST.fit()
print(variableForESTVisualization.summary())

# One VIF per column of the design matrix, including the constant column.
VIFS = [
    vif(variableForStatisticX.values, i)
    for i in range(len(variableForStatisticX.columns))
]
# Notebook-style display: as a plain script this Series is discarded.
# NOTE(review): ``matrix`` is presumably an alias for pandas - confirm.
matrix.Series(data=VIFS, index=variableForStatisticX.columns)

############ PART 2. GMDH FOR REGRESSION ANALYSIS ############
GMDH = MultilayerGMDH(ref_functions='linear')
GMDHModel = GMDH.fit(trainDataX, trainDataY)
predictionByGMDH = GMDH.predict(testDataX)
figures.scatter(testDataY, predictionByGMDH)
# Variance inflation factors

# In[21]:

# don't forget to add constant if the ols model includes intercept
df_exog = sm.add_constant(df.drop('medv', axis = 1))

# too fancy for printing results?
# Iterate over df_exog's own columns so each label matches the column index
# handed to vif(). Iterating df.columns paired every VIF with the wrong name,
# because df_exog has 'const' added and 'medv' removed.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
for i, col in enumerate(df_exog.columns):
    if col == 'const':
        pass
    elif len(col) > 6:
        print(col, ':', "{0:.2f}".format(vif(df_exog.values, i)))
    else:
        print(col, '\t:', "{0:.2f}".format(vif(df_exog.values, i)))

# Run a regression excluding *age* predictor (formula = 'medv ~ . - age')

# In[22]:

lm = smf.ols(formula = ols_formula(df, 'medv', 'age'), data = df)
lm_fit = lm.fit()
lm_fit.summary()

# ### 3.6.4 Interaction Terms
import pandas as pd
from sklearn.linear_model import LinearRegression as lm
import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

df = pd.read_csv("bike.csv")
# Notebook-style display: as a plain script the result is discarded.
df.head()

# Formula right-hand side: every column except the first one and the last
# three, joined with "+".
features = "+".join(df.columns[1:-3])
# Build the response and design matrices for "casual ~ <features>".
y, X = dmatrices("casual ~ " + features, df, return_type = "dataframe")

# One VIF per design-matrix column.
df_vif = pd.DataFrame()
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif["features"] = X.columns
df_vif

# Same formula fitted with the statsmodels formula API.
model1 = smf.ols("casual ~ " + features, data = df)
print(model1.fit().summary())

# scikit-learn linear regression on the same raw feature columns;
# ``lm`` is the LinearRegression alias imported above.
X_df = df.iloc[:, 1:-3]
model2 = lm().fit(X_df, y)
# Predictions for the first three rows (discarded when run as a script).
model2.predict(X_df.iloc[:3, :])
# Fit the Ordinary Least Squares Regression Model
import statsmodels.api as sm
model = sm.OLS(Y, X)
# Train the model
model = model.fit()
# Check the model summary
model.summary()

# Calculate variance inflation factor
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Repeatedly drop the column with the largest VIF while that VIF exceeds 10.
for i in range(len(independent_variables)):
    vif_list = [
        vif(data[independent_variables].values, index)
        for index in range(len(independent_variables))
    ]
    mvif = max(vif_list)
    print("Max VIF value is", mvif)
    drop_index = vif_list.index(mvif)
    print("For the Independent variable", independent_variables[drop_index])
    if mvif > 10:
        print("Deleting", independent_variables[drop_index])
        independent_variables = independent_variables.delete(drop_index)
    else:
        # Nothing exceeds the threshold, so further passes would only
        # recompute and reprint the same VIFs.
        break
print("Final Independent Variables", independent_variables)

# Refit on the remaining columns.
Y = data["price"]
X = data[independent_variables]
model = sm.OLS(Y, X)
model = model.fit()
# Correlation heatmap of the WOE-encoded data; the upper triangle is masked.
corr = round(dataWOE.corr(),2)
# np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize = (5, 5))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, annot =True, cbar_kws={"shrink": .5})
plt.show()

# Select variables whose VIF is below 10.
"""选择方差共线性<10的变量"""
col = np.array(data[short_list_2])
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
for i in range(len(short_list_2)):
    print ('{} VIF是{}'.format(short_list_2[i], vif(col, i)))

# Assess significance.
"""判断显著性"""
# Work on a copy so adding the intercept column does not write into a slice
# of ``data`` (pandas SettingWithCopyWarning).
X = data[short_list_2].copy()
X['intercept'] = [1]*X.shape[0]
y = data['target']
import statsmodels.api as sm
lr_sm=sm.Logit(y, X).fit()
lr_sm.summary()

# Modelling.
'''建模'''
def judge_vif(X):
    """Tabulate the variance inflation factor of every column of ``X``.

    Args:
        X: ``pandas.DataFrame`` of features.

    Returns:
        pandas.DataFrame: Column "VIF_Factor" holds one VIF per column of
        ``X``; column "features" holds ``X.columns``.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

    factors = []
    for position in range(X.shape[1]):
        factors.append(vif(X.values, position))

    report = pd.DataFrame([])
    report["VIF_Factor"] = factors
    report["features"] = X.columns
    return report
def _vif_and_coefs(model, frame, preds, event):
    """Return VIFs and fitted logistic coefficients for ``preds`` as a DataFrame."""
    design = np.array(frame[preds])
    vifs = [vif(design, i) for i in np.arange(len(preds))]
    fitted = model.fit(frame[preds], frame[event])
    return pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(fitted.coef_)
    }, index=preds)


def colin_test():
    """Test the collinearity of the logistic equations using VIF.

    For several predictor sets, on BARRA and then ERA5 point data loaded
    through ``optimise_pss``, this computes the VIF of each predictor and
    the coefficients of a balanced logistic regression on the
    ``is_conv_aws`` event. The paired AWS tables are written to CSV and
    printed; the "STA" tables are only printed.
    """
    logit = LogisticRegression(class_weight="balanced", solver="liblinear")
    event = "is_conv_aws"

    #BARRA
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
        "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
        is_pss="hss", model_name="barra_fc_v3")

    #Convective AWS
    df1 = _vif_and_coefs(
        logit, df_aws,
        ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "lr36", "eff_cin"],
        event)
    df2 = _vif_and_coefs(
        logit, df_aws,
        ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "eff_cin"], event)
    # NOTE(review): df3 and df4 are computed but never written or printed.
    df3 = _vif_and_coefs(
        logit, df_aws, ["ml_el", "Umean06", "lr36", "rhmin13", "dcape"],
        event)
    df4 = _vif_and_coefs(
        logit, df_aws, ["ml_el", "Umean06", "rhmin13", "dcape"], event)
    barra_aws = pd.concat([df1, df2], axis=1)
    barra_aws.to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_barra_aws.csv",
        float_format="%.2e")
    print(barra_aws)
    #Test CV HSS scores
    #preds = ["eff_lcl","U1","sb_cape","lr13","rhmin03","eff_cin"]
    #barra_aws = logit_predictor_test("barra", "is_conv_aws", preds, "t_totals", 16)

    #STA
    # NOTE(review): this section is labelled STA but still uses df_aws and
    # the is_conv_aws event; df_sta is never read - confirm intent.
    df1 = _vif_and_coefs(
        logit, df_aws, ["ml_cape", "Umean06", "eff_lcl", "scld"], event)
    print(df1)

    #ERA5
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
        "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
        is_pss="hss", model_name="era5")

    #Convective AWS
    df1 = _vif_and_coefs(
        logit, df_aws,
        ["ml_el", "Umean03", "eff_lcl", "dpd700", "lr36", "rhmin01"], event)
    df2 = _vif_and_coefs(
        logit, df_aws,
        ["ml_el", "Umean03", "eff_lcl", "dpd700", "rhmin01"], event)
    era5_aws = pd.concat([df1, df2], axis=1)
    era5_aws.to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_era5_aws.csv",
        float_format="%.2e")
    print(era5_aws)
    #Test CV HSS scores
    #preds = ["ml_el","Umean03","eff_lcl","dpd700","rhmin01"]
    #era5_aws = logit_predictor_test("era5", "is_sta", preds, "t_totals", 16)

    #STA
    # NOTE(review): same as above - uses df_aws / is_conv_aws, not df_sta.
    df1 = _vif_and_coefs(
        logit, df_aws, ["ml_cape", "Umean06", "srhe_left", "lr13"], event)
    print(df1)
# In[105]: import seaborn as sns corr_df = X.corr(method="pearson") print(corr_df) sns.heatmap(corr_df, vmax=1.0, vmin=-1.0, annot=True) # In[106]: from statsmodels.stats.outliers_influence import variance_inflation_factor as vif vif_df = pd.DataFrame() vif_df['features'] = X.columns vif_df['VIF Factor'] = [vif(X.values, i) for i in range(X.shape[1])] vif_df.round(2) # In[107]: from sklearn.model_selection import train_test_split #split the data into test and train X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10) # In[108]: from sklearn.linear_model import LinearRegression