def vif():
    """Flask view: show the VIF of each column selected in the submitted form.

    On a POST request every submitted form value except the submit button's
    own value ("Calculate VIF") is treated as a column name of the
    module-level ``df``. The variance inflation factor of each selected
    column is computed against the other selected columns and rendered as
    HTML, followed by a preview of ``df``.

    Returns:
        str: the rendered HTML (also stored in the global ``final_html``).
        A ``ValueError`` or ``KeyError`` raised while computing the result is
        turned into an error page. Non-POST requests get the placeholder
        string ``'helloo'``.
    """
    global final_html
    if request.method != 'POST':
        return 'helloo'
    try:
        # Form values are the chosen column names; skip the button's value.
        chi_key = [value for value in MultiDict(request.form).values()
                   if value != "Calculate VIF"]
        exog = df.loc[:, chi_key].values
        # Single-argument print works as a statement or a function call.
        print("chi key %s" % (chi_key,))
        vif_result = "".join(
            "<br>" + name + ": "
            + str(of.variance_inflation_factor(exog, position))
            for position, name in enumerate(chi_key))
        # NOTE(review): df[1:15] skips the first row -- confirm that
        # df[:15] was not intended.
        final_html = (template.s1
                      + "<br><b> Variance Inflation Factor for selected variables </b><br>"
                      + vif_result + "<br><br></div>" + df[1:15].to_html())
        return final_html
    except ValueError:
        final_html = (template.s1
                      + """<br><font color="lightcoral"> Error. Please select valid values</font> <br><br></div>"""
                      + df[1:15].to_html())
        return final_html
    except KeyError:
        final_html = (template.s1
                      + """<br><font color="lightcoral"> Error. Please upload file and select valid values </font> <br><br></div>"""
                      + df[1:15].to_html())
        return final_html
def get_vifs(df):
    """Print and return the variance inflation factor of every column of ``df``.

    A constant column is added with ``sm.add_constant`` before the VIFs are
    computed; the first column of the augmented matrix is skipped, so only
    the remaining columns are reported.

    Args:
        df: DataFrame of explanatory variables.

    Returns:
        pandas Series of VIF values indexed by column name. (The original
        version only printed the result and returned ``None``.)
    """
    X = sm.add_constant(df)
    exog = np.array(X)
    # Column 0 is skipped -- presumably the added constant; confirm that
    # sm.add_constant prepends here.
    columns = X.columns[1:]
    vif_list = [variance_inflation_factor(exog, i)
                for i in range(1, X.shape[1])]
    result = Series(vif_list, columns)
    # Single-argument print is valid on both Python 2 and Python 3.
    print("VIF of all columns are: \n%s" % (result,))
    return result
def VIF(typ, Currency, x_ls, exog_id):
    """Return the VIF of one regressor from a stored regression workbook.

    Reads ``<ROOT_DIR>cleaned data/regression data/<typ>/<Currency>_<typ>.xlsx``,
    drops every row that has a null in any of the ``x_ls`` columns and
    computes the variance inflation factor of column ``exog_id`` within the
    ``x_ls`` columns.

    Args:
        typ: sub-folder name, also used as the file-name suffix.
        Currency: file-name prefix.
        x_ls: list of column names used as regressors.
        exog_id: position within ``x_ls`` of the regressor to evaluate.

    Returns:
        The value returned by ``variance_inflation_factor``.
    """
    reg_df = pd.read_excel(ROOT_DIR + 'cleaned data/regression data/'
                           + typ + '/' + Currency + '_' + typ + '.xlsx')
    columns = list(x_ls)
    if columns:
        # One pass instead of one boolean filter per column.
        reg_df = reg_df.dropna(subset=columns)
    # ``.values`` replaces DataFrame.as_matrix(), removed from pandas.
    mx = reg_df[columns].values
    return outliers_influence.variance_inflation_factor(mx, exog_id)
def test_collinearity(data, explanatory_variables, threshold=5):
    """Print the VIF of each explanatory variable and list the collinear ones.

    Args:
        data: 2-D array-like; column ``k`` is taken to belong to
            ``explanatory_variables[k]``.
        explanatory_variables: sequence of variable names, in column order.
        threshold: variables whose VIF is strictly greater than this value
            are reported as highly collinear (default 5, the previously
            hard-coded cut-off).

    Returns:
        None. The VIF table and the list of flagged variables are printed.
    """
    data = numpy.array(data)
    # Use the position directly: ``list.index`` is O(n) per lookup and
    # returns the first match, i.e. the wrong column for repeated names.
    vif_list = [
        outliers_influence.variance_inflation_factor(data, position)
        for position in range(len(explanatory_variables))
    ]
    highly_collinear_attr = [
        attr for attr, vif in zip(explanatory_variables, vif_list)
        if vif > threshold
    ]
    print('\nVariance Inflation Factors:')
    print(pandas.DataFrame(vif_list, index=explanatory_variables,
                           columns=['VIF']).T)
    print('\nHighly collinear features:')
    print(highly_collinear_attr)
def rm_vif(X, max_vif_threshold=30, mean_vif_threshold=10):
    """Iteratively drop the column with the largest VIF.

    While the largest VIF among columns 1.. exceeds ``max_vif_threshold``
    and the mean VIF over all columns exceeds ``mean_vif_threshold``, the
    column with the largest VIF is removed and a message is printed.
    Column 0 is never removed.

    Args:
        X: DataFrame of regressors. It is copied, not modified.
        max_vif_threshold: cut-off for the largest VIF (default 30).
        mean_vif_threshold: cut-off for the mean VIF (default 10).

    Returns:
        DataFrame containing the retained columns.
    """
    import statsmodels.stats.outliers_influence as smso

    indep = X.copy()
    # With fewer than two columns there is nothing to compare; the old
    # code raised ValueError on the empty ``vifs[1:]`` slice.
    while indep.shape[1] > 1:
        values = indep.values
        vifs = np.array([smso.variance_inflation_factor(values, i)
                         for i in range(indep.shape[1])])
        # NOTE(review): column 0 is excluded from the search but included
        # in the mean -- presumably it is the intercept; confirm.
        max_vif = vifs[1:].max()
        if not (max_vif > max_vif_threshold
                and vifs.mean() > mean_vif_threshold):
            break
        where_vif = vifs[1:].argmax() + 1
        name = indep.columns.values[where_vif]
        # The whole message is formatted before printing; the old
        # ``print (...) % nms`` only worked as a Python 2 statement.
        print((bcolors.FAIL + bcolors.UNDERLINE
               + "\n%s removed due to multicollinearity.\n"
               + bcolors.ENDC) % (name,))
        keep = np.arange(indep.shape[1]) != where_vif
        # ``.iloc`` with a boolean mask replaces the removed ``.ix``.
        indep = indep.iloc[:, keep]
    return indep
def vif():
    """Flask view: show the VIF of each column selected in the submitted form.

    On a POST request every submitted form value except the submit button's
    own value ("Calculate VIF") is treated as a column name of the
    module-level ``df``. The variance inflation factor of each selected
    column is computed against the other selected columns and rendered as
    HTML, followed by a preview of ``df``.

    Returns:
        str: the rendered HTML (also stored in the global ``final_html``),
        or the placeholder string ``'helloo'`` for non-POST requests.
        Exceptions raised while selecting columns or computing the VIFs are
        not handled here.
    """
    global final_html
    if request.method != 'POST':
        return 'helloo'
    # Form values are the chosen column names; skip the button's value.
    chi_key = [value for value in MultiDict(request.form).values()
               if value != "Calculate VIF"]
    exog = df.loc[:, chi_key].values
    vif_result = "".join(
        "<br>" + name + ": "
        + str(of.variance_inflation_factor(exog, position))
        for position, name in enumerate(chi_key))
    # NOTE(review): df[1:15] skips the first row -- confirm that df[:15]
    # was not intended.
    final_html = (template.s1
                  + "</div><br> VIF Results for selected variables <br>"
                  + vif_result + "<br>" + df[1:15].to_html())
    return final_html
# Keep the autocorrelation function of the regression residuals.
ACF_resid = tsa.acf(rlt.resid)

# The (Japanese) note below says, in short: stationary residuals indicate a
# stable relationship between the regressors and the dependent variable;
# apart from serial correlation, multicollinearity should be checked with
# the VIF statistic (values far above 10 are a concern); dropping one of
# the strongly correlated regressors is essentially the only remedy, and
# ridge-type restrictions are not recommended in econometrics.
"""
誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。
系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。
以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコの解決策は強相関している説明変数のどれかを取り除くくらいしか解決策がありません。
リッジ回帰など、パラメター空間を制約するやり方はそもそもパラメターの不偏性を犠牲にする上に、必ずしもマルチコを解消させる保障がないため、歪めますので、計量経済学では推奨されていません。
"""

# Multicollinearity check: one VIF per regressor, skipping column 0 of the
# design matrix. VIF > 10 deserves attention.
VIF = pd.DataFrame(
    {'VIF': [oti.variance_inflation_factor(rlt.model.exog, i)
             for i in range(1, rlt.model.exog.shape[1])]},
    index=rlt.model.exog_names[1:],
)

# The (Japanese) note below introduces the graphical diagnostic tools of
# statsmodels that support trial-and-error model building.
"""
■ トライ&エラーを補助してくれる可視化ツール
回帰分析にはトライ&エラーが付き物です。むしろ、ほとんどの経済現象は、線形式で完全に記述できるはずがありませんから、色々な回帰式を当てはめたり、サンプル期間変えたりして初めて、経済現象の全体像を掴むことができるのだと思います。
statsmodelsには、こうしたトライ&エラーをサポートする可視化ツールが付いています。以下のモジュールをimportしてみましょう。
"""

# /// Graphical Diagnostic Tools /// ---------------------------------
import statsmodels.graphics.regressionplots as regplot
Test_X = Test.drop('is_click', axis=1).copy()
Test_Y = Test['is_click'].copy()

########################
# Multicollinearity check
########################

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Every column whose VIF exceeds 10 is collected here and then dropped from
# both the training and the test features.
cols_to_drop_vif = []

# Convert once; the matrix does not change while the VIFs are computed.
train_values = Train_X.values

# Visit every column. The previous bound ``Train_X.shape[1]-1`` never
# evaluated the last column, so it could not be flagged.
# NOTE(review): if the last column was excluded on purpose (e.g. an
# intercept), restore the exclusion explicitly by name.
for i, column in enumerate(Train_X.columns):
    temp_vif = variance_inflation_factor(train_values, i)
    print(column, ": ", temp_vif)
    if temp_vif > 10:
        print('Since vif value is greater than 10 so dropping the column ', column)
        cols_to_drop_vif.append(column)

Train_X.drop(cols_to_drop_vif, axis=1, inplace=True)
Test_X.drop(cols_to_drop_vif, axis=1, inplace=True)

########################
# Feature Scaling
########################

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Baseline model: fit, predict and report RMSE and R^2 on the test split.
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
print(rmse)
print(r2_score(Y_test, Y_pred))

# To dig further, try to reduce multicollinearity using p-values / VIF.
# First attempt: VIF.
print("Trying VIF")
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of every column except the target.
# NOTE(review): no constant column is added before the VIFs are computed --
# confirm that this is intended.
ndf = df.drop(['shares'], axis=1)
vif = pd.DataFrame({
    "features": ndf.columns,
    "vif_Factor": [variance_inflation_factor(ndf.values, i)
                   for i in range(ndf.shape[1])],
})

# Keep every feature whose VIF is not above 5.
Z = vif.loc[~vif['vif_Factor'].gt(5)]

# Refit on the reduced feature set to see whether the scores improve.
X = df[Z['features']]
Y = df[['shares']]
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=85)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
from sklearn.metrics import r2_score
# Assessing collinearity using the condition number (reported in the OLS
# summary printed below).
model_fitted = sm.ols(formula='Edad ~ Abertura_A + Abertura_B + Raiz_A + Raiz_B + Superficie_A +Superficie_B ', data=wild_boar_data).fit()  # this is the model
print(model_fitted.summary())  # shows the OLS regression output

# Assessing multicollinearity using the variance inflation factor.
# ``.values`` replaces DataFrame.as_matrix(), which was removed from pandas.
wb_data = wild_boar_data.values
X = wb_data[:, 2:]    # columns 2.. are used as predictors
Y = wb_data[:, 0]     # column 0 is used as the response
zone = wb_data[:, 1]  # column 1 is kept separately

vif_wild_boar = [vif.variance_inflation_factor(X, i)
                 for i in range(X.shape[1])]

print("######################################################################")
print("Variance inflation factor")
print(vif_wild_boar)

###############################################################################
#
# Split train and test
#
###############################################################################

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

###############################################################################
#
# PCA and PLSR analysis
# **Logistic regression using statsmodels [Model 1]**

# In[55]:

# Fit a binomial GLM on the currently selected columns plus a constant.
X_train_sm = sm.add_constant(X_train[col])
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial()).fit()
logm1.summary()

# In[56]:

# VIF of each selected column, rounded to two decimals, largest first.
VIF = pd.DataFrame({
    'Features': X_train[col].columns,
    'VIF': [variance_inflation_factor(X_train[col].values, i)
            for i in range(X_train[col].shape[1])],
})
VIF['VIF'] = VIF['VIF'].round(2)
VIF = VIF.sort_values(by='VIF', ascending=False)
VIF

# The VIF values of all columns are below 5, but a few columns have high
# p-values. Based on its high p-value, the column `Had a Phone Conversation`
# is dropped.

# In[57]:

# Drop the column with the high p-value.
col = col.drop('Had a Phone Conversation')
col

# **Logistic regression using statsmodels [Model 2]**
X_smo, y_smo = smo.fit_sample(X_train, y_train) insample_smo = pd.DataFrame(X_smo).merge(pd.DataFrame(y_smo),left_index=True,right_index=True) insample_smo.columns = ['FICO', 'DTI', 'OLTV', 'Units_234','First Time Homebuyer Flag_Y', 'Occupancy_I', 'Occupancy_S', 'Channel_C', 'Channel_R','Channel_T', 'Property Type_CO', 'Property Type_CP','Property Type_MH', 'Property Type_SF', 'Purpose_C', 'Purpose_N', 'Number of borrowers_1','Default Status'] insample_smo['Default Status'].value_counts()[1]/insample_smo['Default Status'].value_counts().sum() insample_smo = insample_smo.sample(frac=1).reset_index(drop=True) X_train = insample_smo.drop(['Default Status'],axis=1) y_train = insample_smo.loc[:, 'Default Status'] ''' # Multicolinearity check vif = pd.DataFrame() vif["features"] = X_train.columns vif["VIF Factor"] = [ variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1]) ] # Build models '''Logistic Regression''' # Logistic regression with stats logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit() print('ALL', logit_model.summary()) # Logistic regression with sklearn log = LogisticRegression(random_state=0, solver='lbfgs') modeloutcome(log, X_train, y_train, X_test, y_test) searchthreshold(log, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) learningcurve(log, X_train, y_train) log.coef_
# Separate the target from the test features and score the first model.
y_test = df_test.pop('Salary')
x_test = df_test
x_test_lm = sm.add_constant(x_test)
x_test_lm.shape
y_pred_lm = lr.predict(x_test_lm)
r2_test = r2_score(y_test, y_pred_lm)

# VIF of every column of the training design matrix, rounded to two
# decimals and listed from largest to smallest.
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame({
    'Features': x_train_lm.columns,
    'VIF': [variance_inflation_factor(x_train_lm.values, i)
            for i in range(x_train_lm.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

# Second model: refit and rescore without 'Interview_Score'.
x_train_lm_1 = x_train_lm.drop('Interview_Score', axis=1)
lr_1 = sm.OLS(y_train, x_train_lm_1).fit()
lr_1.summary()

x_test_lm_1 = x_test_lm.drop('Interview_Score', axis=1)
y_pred_lm_1 = lr_1.predict(x_test_lm_1)
r2_test_1 = r2_score(y_test, y_pred_lm_1)

# Third design matrix: additionally remove 'Test_Marks'.
x_train_lm_2 = x_train_lm_1.drop('Test_Marks', axis=1)
print("intersection punto = ", regr.intercept_)
print("NO2 = ", regr.coef_[0], "Viento_MAX + ", regr.coef_[1], "T_MAX + ",
      regr.intercept_)

## NO2 explained by Viento_MAX, T_MAX and Lluvia
Viento_T_lluvia = data_set[['Lluvia', 'T_MAX', 'Viento_MAX']]
regr.fit(Viento_T_lluvia, NO2)
print("R cuadrado = ", regr.score(Viento_T_lluvia, NO2))
print("coeff lineal = ", regr.coef_)
print("intersection punto = ", regr.intercept_)
# Coefficients follow the column order above: Lluvia, T_MAX, Viento_MAX.
print("NO2 = ", regr.coef_[2], "Viento_MAX + ", regr.coef_[1], "T_MAX + ",
      regr.coef_[0], "Lluvia + ", regr.intercept_)

## Multicollinearity
# V.I.F. = 1/(1-R^2)
# NOTE(review): the VIF is computed over every column of data_set, which
# also holds 'NO2' (used below) -- confirm the response should be included.
vif = [
    variance_inflation_factor(data_set.values, i)
    for i in range(data_set.shape[1])
]
for value in vif:
    print(value)

## Polynomial fits of degree 1 to 4
# The equations only link one attribute with NO2 at a time; no equation
# linking NO2 with all three attributes simultaneously was found.
for i in range(1, 5):
    poly = np.polyfit(data_set['NO2'],
                      data_set[['Viento_MAX', 'T_MAX', 'Lluvia']], i)
    print(poly)

## Non-linear multiple regression based on trees
multArbol = DecisionTreeRegressor()
multArbol.fit(Viento_T_lluvia, NO2)
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Fit an OLS model on the training data and predict the test data.
lr = sm.OLS(y_train, X_train)
lr_model = lr.fit()
lr_preds = lr_model.predict(X_test)
# ``summary()`` is called on the fitted results; the predictions returned by
# ``predict`` have no such method, so the old ``lr_preds.summary()`` raised
# AttributeError.
lr_model.summary()

# For each column of X_train, calculate the VIF and save it in a dataframe.
# The plain array is passed because the function indexes columns by position.
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(X_train.values, i)
    for i in range(X_train.shape[1])
]
vif["features"] = X_train.columns

#
#
#from sklearn.preprocessing import PolynomialFeatures
#
#
#poly = PolynomialFeatures(degree=2)
#
#X_train = poly.fit_transform(X_train)
#
#y_train = poly.fit_transform(y_train)

from helper import rmse
# plt.scatter(df['collected debris'], q_column['q_val']) # plt.show() x = df[features] #ydf_merged = df['qval'] y = q_column['q_val'] df_xy = df.copy() df_xy['Qval'] = y df_xy.to_csv('C:/Users/ulusan.a/Desktop/RL_rep/RL/data_files/xNy_INS2_V3.csv') # features = [3,4] x = df[features] vif = pd.DataFrame() vif["VIF Factor"] = [ variance_inflation_factor(x.values, i) for i in range(x.shape[1]) ] vif["features"] = x.columns # sc = StandardScaler() # x = sc.fit_transform(x) kf = KFold(n_splits=5, shuffle=True) sum = 0 model = LinearRegression() results = [] i = 0 mse = [] for train, test in kf.split(df):
x1_IV = high_IV_sorted[i][1] y1_IV = high_IV_sorted[j][1] if x1_IV > y1_IV: deleted_index.append(j) else: deleted_index.append(i) multi_analysis_vars_1 = [ high_IV_sorted[i][0] + "_WOE" for i in range(cnt_vars) if i not in deleted_index ] ''' 多变量分析:VIF ''' X = np.matrix(trainData[multi_analysis_vars_1]) VIF_list = [variance_inflation_factor(X, i) for i in range(X.shape[1])] max_VIF = max(VIF_list) print(max_VIF) # 最大的VIF是1.32267733123,因此这一步认为没有多重共线性 multi_analysis = multi_analysis_vars_1 #%% ''' 第六步:逻辑回归模型。 要求: 1,变量显著 2,符号为负 ''' ### (1)将多变量分析的后变量带入LR模型中 y = trainData['y'] X = trainData[multi_analysis]
# print("Info : ",boston.info())
# print("Shape : ",boston.shape)
#
# print(boston_data.keys())
# print(boston_data.DESCR)
#
#

# Scaling the data
scalar = sklearn.preprocessing.StandardScaler()
scaled_boston = scalar.fit_transform(boston)
#print(type(scaled_boston))

# Checking for multicollinearity: one VIF per scaled feature column.
vif_1 = pd.DataFrame()
vif_1['VIF'] = [
    variance_inflation_factor(scaled_boston, i)
    for i in range(scaled_boston.shape[1])
]
vif_1['features'] = boston_data.feature_names
#print("VIF : \n",(vif_1))

# Model 1: linear regression on all scaled features.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(
    scaled_boston, target, test_size=0.2, random_state=22)
lin_reg_1 = LinearRegression()
lin_reg_1.fit(x_train_1, y_train_1)
# r2_score expects (y_true, y_pred). The score is not symmetric, so the old
# call with the arguments swapped reported a different number.
score_1 = r2_score(y_test_1, lin_reg_1.predict(x_test_1))
def variance_inflation_factors(df, labels):
    """Print the variance inflation factor of each selected column.

    Args:
        df: DataFrame holding the data.
        labels: column labels to restrict ``df`` to; the VIFs are computed
            among these columns only.

    Returns:
        None. One line per column is printed: the name padded to the width
        of the longest name, then the VIF with one decimal.
    """
    df = df[labels]
    exog = df.values
    # ``or [0]`` keeps max() from raising when no labels are given.
    fill = max([len(name) for name in df.columns] or [0])
    for index, name in enumerate(df.columns):
        vif = variance_inflation_factor(exog, index)
        # Single-argument print is valid on both Python 2 and Python 3.
        print("{:{fill}} {:>7.1f}".format(name, vif, fill=fill))
# Skewness of all numeric columns, then of the two log-transformed targets.
print(stats.skew(Chunkiboi.drop(['Company'], axis=1), nan_policy='omit'))
for column in ('Wordcount_Review_log', 'Useful_log'):
    print(stats.skew(Chunkiboi[column]))

#plots = sns.pairplot(Chunkiboi_dropped)

# VIF of every column (including the added constant) on complete rows.
X1 = sm.tools.add_constant(Chunkiboi.drop(['Company'], axis=1).dropna())
vif_values = [variance_inflation_factor(X1.values, i)
              for i in range(X1.shape[1])]
series = pd.Series(vif_values, index=X1.columns)
print(series)

# Regressions
Chunkiboi_dropped = Chunkiboi.dropna()

# Null models: the columns listed below are removed, the remaining columns
# plus a constant are the regressors.
null_excluded = [
    'Wordcount_Review_log', 'Useful_log', 'Readability', 'Diagnosticity',
    'MRFreq', 'Wordcount Response', 'Response Speed in Days', 'Company',
    'Review Count',
]
Xnull = sm.add_constant(Chunkiboi_dropped.drop(null_excluded, axis=1))
Ynull1 = Chunkiboi_dropped[['Wordcount_Review_log']]
modelnull1 = sm.OLS(Ynull1, Xnull).fit()
## sm.OLS(output, input)
#predictionsnull1 = modelnull1.predict(Xnull)
def one_hot_encoding(dataset, drop=True):
    """One-hot encode 'ocean_proximity' and optionally prune high-VIF features.

    One 0/1 column per distinct 'ocean_proximity' value is added to
    ``dataset`` (in place) and the original column is removed.

    With ``drop`` set, the feature with the largest variance inflation
    factor is removed repeatedly, on a standardized copy, until the largest
    VIF is no longer above 5.0. The label 'median_house_value' is kept out
    of the VIF computation and the returned frame holds the original,
    non-standardized values of the surviving columns.

    Args:
        dataset: DataFrame with 'ocean_proximity' and, when ``drop`` is
            set, 'median_house_value' columns.
        drop: whether to run the VIF-based feature removal.

    Returns:
        The encoded DataFrame (the same object as ``dataset`` when ``drop``
        is false, a column subset of it otherwise).
    """
    proximity = dataset['ocean_proximity']
    categories = list(proximity.value_counts().keys())
    print()

    # One indicator column per distinct category value.
    for category in categories:
        dataset[category] = np.where(proximity == category, 1.0, 0.0)
    dataset.pop('ocean_proximity')

    if not drop:
        return dataset

    # Work on a copy so the caller's values are never standardized.
    reduced = dataset.copy()
    label = reduced.pop('median_house_value')

    # Indicator columns (and the label) are not standardized.
    not_standardized = (
        'NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND',
        'median_house_value',
    )
    for key in reduced.keys():
        if key not in not_standardized:
            reduced[key] = standardizer(reduced[key])

    while True:
        vif = pd.DataFrame()
        vif['Feature'] = reduced.columns
        vif['VIF'] = [
            variance_inflation_factor(reduced.values, i)
            for i in range(reduced.shape[1])
        ]
        # Show the current VIF table.
        print(vif.round(1))
        print()
        # print(vif.round(1).sort_values(by='VIF', ascending=False).to_latex(index=False))
        # print()

        ranked = vif.sort_values(by='VIF', ascending=False,
                                 ignore_index=True)
        # Stop as soon as the largest VIF is not above the 5.0 cut-off.
        if not ranked['VIF'][0] > 5.0:
            break
        worst = ranked['Feature'][0]
        reduced.pop(worst)
        print(worst + ' has been dropped')

    # Restore the label, then select the surviving columns from the
    # original (non-standardized) data.
    reduced['median_house_value'] = label
    return dataset[reduced.keys()]
def _vif_with_constant(predictors):
    """Return the VIF of each column of ``predictors`` plus an added constant.

    The result is a Series indexed by column name, the constant included.
    """
    X = sm.add_constant(predictors)
    values = X.values
    return pd.Series(
        [variance_inflation_factor(values, i) for i in range(X.shape[1])],
        index=X.columns)


def compute_regression_csa(x, y, p_in, p_out, contrast, path_model):
    """
    Compute stepwise model and complete linear model of CSA. Save both models, compare and analyse residuals.
    Apply normalization method from model and compute COV.
    Args:
        x (panda.DataFrame): Data of predictors
        y (panda.DataFrame): Data of CSA
        p_in (float): include a predictor if its p-value < p_in for stepwise ** p_in <= p_out
        p_out (float): exclude a predictor if its p-value > p_out for stepwise
        contrast (str): Contrast of the image that CSA value was computed from
        path_model (str): Path of the result folder of the models
    Return:
        COV_step, COV_full
    """
    # Create the result directory for this contrast. ``exist_ok`` avoids the
    # check-then-create race and missing parent folders are created too.
    path_model_contrast = os.path.join(path_model, contrast)
    os.makedirs(path_model_contrast, exist_ok=True)

    # Stepwise linear regression driven by p-values.
    logger.info("Stepwise linear regression {}:".format(contrast))
    selected_predictors = compute_stepwise(x, y, p_in, p_out)
    logger.info('For ' + contrast + ' selected predictors are : {}'.format(selected_predictors))

    # Model restricted to the predictors selected by the stepwise procedure.
    model = generate_linear_model(x, y, selected_predictors)

    # VIF of the selected predictors.
    vif_data = _vif_with_constant(x[selected_predictors])
    logger.info('VIF of predictors: \n{}'.format(vif_data))

    # Apply normalization method.
    COV_step = apply_normalization(y, x, model.params)
    m1_name = 'stepwise_' + contrast
    # Save the model summary and the regression coefficients.
    save_model(model, m1_name, path_model_contrast, x=x[selected_predictors])

    # Linear regression with all predictors.
    model_full = generate_linear_model(x, y)
    m2_name = 'fullLin_' + contrast

    # VIF of all predictors.
    vif_data_full = _vif_with_constant(x)
    logger.info('VIF of predictors: \n{}'.format(vif_data_full))

    # Apply normalization method.
    COV_full = apply_normalization(y, x, model_full.params)
    # Save the model summary and the regression coefficients.
    save_model(model_full, m2_name, path_model_contrast, x=x)

    # Compare the full and reduced models (F-value, R^2, ...) and save as .csv.
    compared_models = compare_models(model, model_full, m1_name, m2_name)
    logger.info('Comparing models: {}'.format(compared_models))
    compared_models_filename = os.path.join(path_model_contrast, 'compared_models') + '.csv'
    df_to_csv(compared_models, compared_models_filename)

    # Residual analysis of both models.
    logger.info('\nAnalysing residuals...')
    residuals_path = os.path.join(path_model_contrast, 'residuals')
    analyse_residuals(model, m1_name, data=pd.concat([x, y], axis=1), path=residuals_path)
    analyse_residuals(model_full, m2_name, data=pd.concat([x, y], axis=1), path=residuals_path)

    return COV_step, COV_full
# Pairwise correlations of all columns.
correlation_matrix = data.corr()
print("Correlation Matrix\n", correlation_matrix)
print()

#####################################
## VIF (Variance Inflation Factor)
#
# VIF = 1 / (1 - R^2)
#####################################
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrices: the first column is the response, the rest are regressors.
formula = f"{data.columns[0]} ~ {' + '.join(data.columns[1:])}"
y, x = dmatrices(formula, data, return_type='dataframe')

vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
vif["features"] = x.columns
print("VIF (Varriance Inflation Factor)")
print(vif)
print()

# Same statistic by hand: regress each regressor on all the other
# regressors and apply VIF = 1 / (1 - R^2).
my_vif = []
for i in range(1, len(data.columns)):
    others = data.columns[1:i].append(data.columns[i+1:])
    lm = smf.ols(f"{data.columns[i]} ~ {' + '.join(others)}", data).fit()
    lm.summary()
    print()
    vif_for_i_th_X = round(1 / (1 - lm.rsquared), 6)
    my_vif.append(vif_for_i_th_X)
vif2 = pd.DataFrame(my_vif, columns=['VIF Factor'])
print(model.params)
print(model.bic)

##### Wald chi-square test for every variable in the model
for col in model.params.index:
    result = model.wald_test(col)
    print(str(col) + " wald test: " + str(result.pvalue))

### Inspect the VIF values
from statsmodels.stats.outliers_influence import variance_inflation_factor

model_vars = list(model.params.index)
train_X_M = np.matrix(train_all_train[model_vars])
VIF_list = [
    variance_inflation_factor(train_X_M, i)
    for i in range(train_X_M.shape[1])
]

### Retrain the model
## model = smf.Logit(train_all_train['Defaulter'], train_all_train[list(model.params.index)]).fit()

###
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, accuracy_score

## Predict the training set with the fitted model.
## First separate the X and Y parts of the data set.
train_all_train_X = train_all_train[model_vars]
train_all_train_Y = train_all_train['Defaulter']
# +
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the patsy formula from the training feature names and evaluate it
# on features and target side by side.
features_vif = "+".join(X_train_log.columns)
features_target = pd.concat([X_train_log, y_train_log], axis=1)

y_VIF, X_VIF = dmatrices('log_saleprice ~' + features_vif,
                         features_target,
                         return_type="dataframe")

# One VIF per column of the design matrix.
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(X_VIF.values, i)
                   for i in range(X_VIF.shape[1])],
    "features": X_VIF.columns,
})

vif.round(1)
# -

# - There are no features with a VIF factor greater than 5, the general cutoff for multicollinearity "concern." The highest VIF values are 3.1 and 3.2 for adj_ovr_qual and good_ament_ct, respectively.

# ## Model Output as Pickle Object
# - Output of model object to be utilized in prediction

# + pycharm={"is_executing": false}
from joblib import dump
print("Test data r squared:", lr.score(X_test_tr, y_test))
# Without log: 74 train, 71 test //// With log: 79 train, 76 test

############## Some stats #######################
df["Price"].skew()
y_log = np.log(df["Price"])
print(y_log.skew())

# OLS on the training data with an added constant.
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()
pd.DataFrame({"coef": results.params,
              "p-value": round(results.pvalues, 3)})

# Testing for multicollinearity
variance_inflation_factor(exog=X_incl_const.values, exog_idx=1)

# A VIF threshold of about 10 is used; values over 10 are problematic.
vif = [
    variance_inflation_factor(exog=X_incl_const.values, exog_idx=i)
    for i in range(X_incl_const.shape[1])
]
print(vif)

org_coef = pd.DataFrame({
    "coef_name": X_incl_const.columns,
    "vif": np.round(vif, 2)
})
print(results.bic)  # -129
print(results.rsquared)  # 0.796

# Model complexity: Bayesian Information Criterion
# Reduced model #1, excluding INUS
inplace=True, axis=1) print(chat_up_.head()) # In[33]: from statsmodels.tools.tools import add_constant # In[34]: chat_up_ = add_constant(chat_up_) # In[36]: vif = pd.Series([ variance_inflation_factor(chat_up_.values, i) for i in range(1, chat_up_.shape[1]) ], index=chat_up_.columns[1:]) tolerance = 1 / vif # In[37]: print(vif) # In[38]: print(tolerance) # ### also correlation value will show that there is no problem of multicollinearity.So, assumption of multicollinearity has been followed
peca3temp3 = peca3temp3.reset_index(drop=True)
peca3 = pd.concat([peca3temp1, peca3temp2, peca3temp3], axis=0)
abc = peca3.describe()

# Standardize the macro-economic columns to z-scores using the mean and
# standard deviation reported by describe(). One loop replaces eight
# copy-pasted ``.map(lambda ...)`` lines; the arithmetic is unchanged.
for column in [
    'Lumberprice',
    'new housing',
    'HPI',
    'Vacant Housing Units for Sale',
    'business climate',
    'unemployment rate',
    'Rental Vacancy Rate',
    'CLTV',
]:
    peca3[column] = (peca3[column] - abc[column]['mean']) / abc[column]['std']

# VIF
# NOTE(review): rows with missing values are only dropped further below;
# confirm the VIF is meant to be computed before dropna().
pecagroup = peca3.drop(['ID', 'Current Date', 'Current Balance', 'CLDS',
                        'Original Balance', 'Original Date', 'NLDS'], axis=1)
peca1 = pecagroup.to_numpy()
vif = [variance_inflation_factor(peca1, i) for i in range(peca1.shape[1])]
print(vif)

# Logistic Regression for CLDS == 0
peca = peca3.drop(['ID', 'Current Date', 'Current Balance', 'CLDS',
                   'Original Balance', 'Original Date'], axis=1)
# peca['NLDS'] = peca['NLDS'] - 3
temp = peca.dropna()
dy = temp['NLDS']
# The earlier ``dx = temp.drop(['NLDS'], axis=1)`` was overwritten at once
# and has been removed, as has a discarded ``np.corrcoef(dx)`` call, which
# correlated rows (not columns) and so built an n-by-n matrix for nothing.
dx = temp.drop(['NLDS', 'Lumberprice', 'new housing', 'HPI',
                'unemployment rate', 'Rental Vacancy Rate',
                'Vacant Housing Units for Sale'], axis=1)
dx = dx.astype('float32')
meandx = pd.DataFrame(dx.mean()).T
train_X, test_X, train_y, test_y = train_test_split(
    dx, dy, test_size=0.1, train_size=0.1, stratify=dy.values)
regressor = LogisticRegression(multi_class='multinomial', solver='saga',
                               penalty='l2', max_iter=300)
data_cleaned = data_cleaned.drop(columns=['Price'])

# ## Multicollinearity

# In[36]:

data_cleaned.columns.values

# In[37]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of the three numeric features.
# NOTE(review): no constant column is added before the VIFs are computed --
# confirm that this is intended.
variables = data_cleaned[['Mileage', 'Year', 'EngineV']]
vif = pd.DataFrame({
    "VIF": [variance_inflation_factor(variables.values, i)
            for i in range(variables.shape[1])],
    "features": variables.columns,
})

# In[38]:

vif

# In[39]:

# 'Year' is removed from the data set.
data_no_multicollinearity = data_cleaned.drop(columns=['Year'])

# # Create Dummy Variables

# In[40]:
cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True, ax=ax) # correlation matrix corr ##### we can clearly see that some independent variables are highly correlated to each other #* lets calculate the vif and remove those variables in order to lessen the complexity of the model. from statsmodels.stats.outliers_influence import variance_inflation_factor from statsmodels.tools.tools import add_constant fvif = df2.iloc[:, 0:13] X = add_constant(fvif) vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])] pd.DataFrame({'vif': vif[1:]}, index=fvif.columns).T #### Since atemp has a High VIF score we will remove it from our data set. cc = [ 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'casual', 'registered', 'count' ] reg = df2.loc[:, cc] train, test = train_test_split(reg, test_size=0.2) model2 = sm.OLS(train.iloc[:, 12], train.iloc[:, 0:12]).fit() model2.summary() ##### Since p-value of variables ["season","weathersit", "yr","mnth"]
modelpredicitons['gre_GpaRankLM'] = modelGreGpaRank.predict(dfadmit)
print(modelpredicitons.head())

#%%
# And let us check the VIF value (watch out for multicollinearity issues)
# Import functions
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Get the variables for which to compute the VIF and add an intercept term.
# ``assign`` returns a new frame, so the intercept is not written into a
# slice of ``dfadmit`` (the old ``X['Intercept'] = 1`` on the sliced frame
# was a chained assignment and triggered SettingWithCopyWarning).
X = dfadmit[['gpa', 'rank']].assign(Intercept=1)

# Compute and view VIF
vif = pd.DataFrame()
vif["variables"] = X.columns
vif["VIF"] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]  # list comprehension

# View results using print
print(vif)

#%% [markdown]
# But rank really should be categorical.
#
# # Patsy coding
#
# * Strings and booleans are automatically coded
# * Numerical → categorical
# * C() function
# * level 0 → (0,0,0,...)
# * level 1 → (1,0,0,...)
## Multicollinearity test using VIF (variance inflation factor)
# - VIF detects correlation between predictor variables, i.e. the
#   relationship between them.
# - If two predictor variables are correlated, multicollinearity is present.
# - Multicollinearity affects regression models, so it should not be
#   present in our variables; this test checks for it.
# - If the VIF is between 1 and 5, we say there is no multicollinearity.
# - If VIF > 5, there is multicollinearity and the variables need to be
#   removed or reconsidered.

# Build the design matrix of predictor variables with patsy.
outcome, predictors = dmatrices(
    'fare_amount ~ distance+passenger_count+date+weekday+month+year+hour',
    train_cab,
    return_type='dataframe')

# Calculate the VIF of each column of the design matrix built from train_cab.
VIF = pd.DataFrame({
    "VIF": [variance_inflation_factor(predictors.values, i)
            for i in range(predictors.shape[1])],
    "Predictors": predictors.columns,
})
VIF

# - The VIF of all predictors is within the required range, i.e. 1 to 5,
#   so multicollinearity is not present in our independent variables.

# AFTER PERFORMING VARIOUS TESTS ON OUR DATA FOR FEATURE SELECTION WE HAVE
# THE FOLLOWING OBSERVATIONS:
# There is no multicollinearity in our data.
# We will remove the 'date' variable from both train and test data.
# All other variables are selected for our ML models.

# Create a copy of the data selected for machine learning.
train_cab_selected = train_cab.copy()
test_cab_selected = test_cab.copy()
plt.plot(cooks_d, 'o', label="Cook's distance") plt.legend(loc='upper left') ax2 = fig.add_subplot(3,1,3) plt.plot(resid_studentized, 'o', label='studentized_resid') plt.plot(dffits, 'o', label='DFFITS') leg = plt.legend(loc='lower left', fancybox=True) leg.get_frame().set_alpha(0.5) #, fontsize='small') ltext = leg.get_texts() # all the text.Text instance in the legend plt.setp(ltext, fontsize='small') # the legend text fontsize print oi.reset_ramsey(res, degree=3) #note, constant in last column for i in range(1): print oi.variance_inflation_factor(res.model.exog, i) infl = oi.OLSInfluence(res_ols) print infl.resid_studentized_external print infl.resid_studentized_internal print infl.summary_table() print oi.summary_table(res, alpha=0.05)[0] ''' >>> res.resid array([ 4.28571429, 4. , 0.57142857, -3.64285714, -4.71428571, 1.92857143, 10. , -6.35714286, -11. , -1.42857143, 1.71428571, 4.64285714]) >>> infl.hat_matrix_diag array([ 0.10084034, 0.11764706, 0.28571429, 0.20168067, 0.10084034, 0.16806723, 0.11764706, 0.08403361, 0.11764706, 0.28571429,
def test_all(self):
    """Check GLSAR and OLS results and diagnostics against Gretl reference values.

    Two scenarios are covered, each with its own hard-coded reference tables:
    first GLSAR with a fixed rho and with an iteratively estimated rho, then
    plain OLS with HAC standard errors plus specification, heteroscedasticity
    and influence diagnostics. The bare triple-quoted strings are pasted
    reference output kept for the record; they have no runtime effect.
    """
    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    # (endogd / exogd are not used again within this method)
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]], prepend=True)

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]],prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    # GLSAR with rho fixed at the reference value.
    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    # GLSAR started from the same rho, then re-estimated iteratively.
    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136) #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    # coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
        [ 4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086], # ***
        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) # **

    #Statistics based on the rho-differenced data:
    # Each entry is (label in the reference output, value).
    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var", 3.113973),
        endog_std = ("S.D. dependent var", 18.67447),
        ssr = ("Sum squared resid", 22530.90),
        mse_resid_sqrt = ("S.E. of regression", 10.66735),
        rsquared = ("R-squared", 0.676973),
        rsquared_adj = ("Adjusted R-squared", 0.673710),
        fvalue = ("F(2, 198)", 221.0475),
        f_pvalue = ("P-value(F)", 3.56e-51),
        resid_acf1 = ("rho", -0.003481),
        dw = ("Durbin-Watson", 1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    # (vif, cond_1norm, determinant, reciprocal_condition_number and normality
    # are recorded here but not asserted against in this method)
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests
    res = res_g1 #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 6)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests
    res = res_g2 #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic
    # Same reference table as above, but with looser tolerances on several checks.
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 3)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2,4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2,4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    ''' Performing iterative calculation of rho... ITER RHO ESS 1 -0.10734 22530.9 2 -0.10814 22530.9 Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv rho = -0.108136 coefficient std. error t-ratio p-value ------------------------------------------------------------- const -9.50990 0.990456 -9.602 3.65e-018 *** ds_l_realgdp 4.37040 0.208146 21.00 2.93e-052 *** realint_1 -0.579253 0.268009 -2.161 0.0319 ** Statistics based on the rho-differenced data: Mean dependent var 3.113973 S.D. dependent var 18.67447 Sum squared resid 22530.90 S.E. of regression 10.66735 R-squared 0.676973 Adjusted R-squared 0.673710 F(2, 198) 221.0475 P-value(F) 3.56e-51 rho -0.003481 Durbin-Watson 1.993858 '''

    ''' RESET test for specification (squares and cubes) Test statistic: F = 5.219019, with p-value = P(F(2,197) > 5.21902) = 0.00619 RESET test for specification (squares only) Test statistic: F = 7.268492, with p-value = P(F(1,198) > 7.26849) = 0.00762 RESET test for specification (cubes only) Test statistic: F = 5.248951, with p-value = P(F(1,198) > 5.24895) = 0.023: '''

    ''' Test for ARCH of order 4 coefficient std. error t-ratio p-value -------------------------------------------------------- alpha(0) 97.0386 20.3234 4.775 3.56e-06 *** alpha(1) 0.176114 0.0714698 2.464 0.0146 ** alpha(2) -0.0488339 0.0724981 -0.6736 0.5014 alpha(3) -0.0705413 0.0737058 -0.9571 0.3397 alpha(4) 0.0384531 0.0725763 0.5298 0.5968 Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491: '''

    ''' Variance Inflation Factors Minimum possible value = 1.0 Values > 10.0 may indicate a collinearity problem ds_l_realgdp 1.002 realint_1 1.002 VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient between variable j and the other independent variables Properties of matrix X'X: 1-norm = 6862.0664 Determinant = 1.0296049e+009 Reciprocal condition number = 0.013819244 '''

    ''' Test for ARCH of order 4 - Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491 Test of common factor restriction - Null hypothesis: restriction is acceptable Test statistic: F(2, 195) = 0.426391 with p-value = P(F(2, 195) > 0.426391) = 0.653468 Test for normality of residual - Null hypothesis: error is normally distributed Test statistic: Chi-square(2) = 20.2792 with p-value = 3.94837e-005: '''

    #no idea what this is
    ''' Augmented regression for common factor test OLS, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv coefficient std. error t-ratio p-value --------------------------------------------------------------- const -10.9481 1.35807 -8.062 7.44e-014 *** ds_l_realgdp 4.28893 0.229459 18.69 2.40e-045 *** realint_1 -0.662644 0.334872 -1.979 0.0492 ** ds_l_realinv_1 -0.108892 0.0715042 -1.523 0.1294 ds_l_realgdp_1 0.660443 0.390372 1.692 0.0923 * realint_2 0.0769695 0.341527 0.2254 0.8219 Sum of squared residuals = 22432.8 Test of common factor restriction Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468 '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    # From here on partable / result_gretl_g1 / arch_4 / normality / reset_* /
    # vif are rebound to the OLS reference values.
    partable = np.array([
        [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049], # ***
        [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258], #***
        [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939]]) # **

    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var", 3.257395),
        endog_std = ("S.D. dependent var", 18.73915),
        ssr = ("Sum squared resid", 22799.68),
        mse_resid_sqrt = ("S.E. of regression", 10.70380),
        rsquared = ("R-squared", 0.676978),
        rsquared_adj = ("Adjusted R-squared", 0.673731),
        fvalue = ("F(2, 199)", 90.79971),
        f_pvalue = ("P-value(F)", 9.53e-29),
        llf = ("Log-likelihood", -763.9752),
        aic = ("Akaike criterion", 1533.950),
        bic = ("Schwarz criterion", 1543.875),
        hqic = ("Hannan-Quinn", 1537.966),
        resid_acf1 = ("rho", -0.107341),
        dw = ("Durbin-Watson", 2.213805))

    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <=0).any(1).sum() = 69 ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"] #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]

    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breush_pagan = [1.302014, 0.521520, 2, "chi2"] #TODO: not available
    het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"] #not available

    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    # Leverage/influence reference table, read from a text file stored next
    # to this test module.
    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0:lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    # If the last parsed row is NaN, re-read with one more footer line skipped.
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0:lambda s: s})

    lev.dtype.names = names

    res = res_ols #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:,0], 5)
    assert_almost_equal(bse_hac, partable[:,1], 5)
    #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    #f-value is based on cov_hac I guess
    #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6,5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6,5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    # Compared against the "konker" reference row, not het_breush_pagan.
    hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    # VIFs for exog columns 1 and 2; computed but not asserted against `vif`.
    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence)) #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
# In[12]:

X_scaled

# To treat the multicollinearity issue the variance inflation factor is used;
# features with a VIF above 5 are removed.

# In[13]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of the scaled matrix, labelled with X's column names.
scaled_column_count = X_scaled.shape[1]
vif = pd.DataFrame({
    "vif": [
        variance_inflation_factor(X_scaled, column_idx)
        for column_idx in range(scaled_column_count)
    ],
    "Features": X.columns,
})

# let's check the values
vif

# Author's observation: none of the features exceeds the threshold,
# so all features are retained.

# ###### Split feature and target columns into Train & Test data

# In[14]:

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
def run_analysis(df, drop_point=None):
    '''
    Fit ``games_played ~ rushing_attempts + total_yards`` by OLS and print
    regression diagnostics (significance, ANOVA, confidence intervals, VIF,
    residual plots and influence measures).

    Parameters
    ----------
    df : DataFrame
        Data holding the formula columns. When points are dropped the frame
        is re-indexed on 'player_rank' (presumably its original index name;
        verify against the caller).
    drop_point : int or list-like, optional
        Point(s) to drop for analysis. The default is None. An empty
        list-like is treated the same as None.

    Returns
    -------
    None
        If points were dropped: only the refitted model and its significance
        summary are printed.
    tuple
        Otherwise ``(X, y, most_infl)``: the design matrix, the response and
        the points flagged by the largest number of influence criteria
        (empty list when nothing is flagged).
    '''
    # `is not None` + size check: index 0 is a valid point to drop, and array
    # inputs must not be evaluated for truthiness.
    dropping = drop_point is not None and np.size(drop_point) > 0

    #####################################
    # DROP INFLUENTIAL POINTS IF NEEDED #
    #####################################
    if dropping:
        df = df.reset_index().drop(drop_point).set_index('player_rank')

    y, X = patsy.dmatrices('games_played ~ rushing_attempts + total_yards', df)

    ###########################
    # LINEAR REGRESSION MODEL #
    ###########################
    model = sm.OLS(y, X)
    results = model.fit()
    results.model.data.design_info = X.design_info
    coefs = np.round(results.params, 3)
    print()
    print(results.summary())

    title_print('Model')
    print('y = {} + {} * rushing_attempts + {} * total_yards'.format(
        coefs[0], coefs[1], coefs[2]))

    ###########################
    # DROP INFLUENTIAL POINTS #
    ###########################
    # If dropping points, only run to here and then exit function
    title_print('Significance')
    if dropping:
        print('Points dropped: {}'.format(drop_point))
    print('Coefficients: {}'.format(np.round(results.params, 3)))
    print('R-squared: {}'.format(round(results.rsquared, 3)))
    if dropping:
        return

    ##############################
    # ANOVA TABLE / SIGNIFICANCE #
    ##############################
    # H0: beta_0 = beta_1 = beta_2
    # H1: beta_j != 0
    # Rushing attempts more significant that total yards
    # Makes sense because more time in league closely associated with
    # more chances to run
    aov_table = sm.stats.anova_lm(results, typ=1)
    title_print('Analysis of Variance table')
    print(aov_table)
    # NOTE(review): this is the 2.5% quantile with (p, n) degrees of freedom;
    # confirm that is the intended critical value.
    print('\nCalculated F-stat: {}'.format(
        round(f.ppf(0.025, X.shape[1] - 1, X.shape[0]), 3)))
    print('Regression F: {}'.format(round(results.fvalue, 2)))
    print('Regression p: {}'.format(round(results.f_pvalue, 4)))
    print('---> Regression is significant <---')

    ########################
    # CONFIDENCE INTERVALS #
    ########################
    conf_int = np.round(results.conf_int(), 3)
    print()
    title_print('95% Confidence Intervals')
    print('Intercept: {} to {}'.format(conf_int[0][0], conf_int[0][1]))
    print('Rushing Attempts: {} to {}'.format(conf_int[1][0], conf_int[1][1]))
    print('Total yards: {} to {}'.format(conf_int[2][0], conf_int[2][1]))

    #####################
    # MULTICOLLINEARITY #
    #####################
    vif = np.round(
        [variance_inflation_factor(X, i) for i in range(X.shape[1])], 4)
    title_print('Multicollinearity')
    for i, v in enumerate(vif):
        print('VIF_{}: {}'.format(i, v))

    #############
    # RESIDUALS #
    #############
    # Get residuals and probability for plot
    residuals = results.resid
    # NOTE(review): i starts at 0, so the first plotting position is negative.
    Prob = [(i - 1 / 2) / len(y) for i in range(len(y))]

    # Plot residuals vs. fitted values
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(results.fittedvalues, residuals)
    ax.axhline(0)
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    plt.title('Residuals Versus Predicted Response')
    plt.show()

    # Calculate OLS using resid to plot straight line. Get y values from model
    resid_results = sm.OLS(Prob, sm.add_constant(sorted(residuals))).fit()
    X_range = np.linspace(min(residuals), max(residuals), len(residuals))

    # Normality plot
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(sorted(residuals), Prob)
    plt.plot(X_range,
             resid_results.params[0] + resid_results.params[1] * X_range)
    plt.xlabel('Residual')
    plt.ylabel('Probability')
    plt.title('Normal Probability Plot')
    plt.show()
    print('---> Heavy-tailed distribution <---')

    ############
    # OUTLIERS #
    ############
    title_print('Outliers / Influence Points')
    # Extreme residuals / fitted value; computed but not reported below.
    pos_out = (np.argmax(residuals), np.amax(residuals))
    neg_out = (np.argmax(-residuals), -np.amax(-residuals))
    x_out = (np.argmax(results.fittedvalues), np.amax(results.fittedvalues))
    # Visually from residual plot, these 3 points are outliers

    # Influential points
    infl = results.get_influence()
    infl_df = infl.summary_frame()
    print(infl_df.head().to_string())
    print('...continued...')
    infl_pts = {}

    # Leverage Points - Hat Diagonal
    n, p = X.shape[0], X.shape[1] - 1
    lev_pt = 2 * p / n
    dhat_pts = list(infl_df[infl_df['hat_diag'] > lev_pt].index)
    print('\n***| Hat Diagonal |***')
    # Backslash escaped explicitly; the printed text is unchanged.
    print('Leverage calculation (2 * p \\ n) = {}'.format(round(lev_pt, 3)))
    print('Points where hat diagonal exceeds leverage calculation: {}'.format(
        dhat_pts))

    # Cook's D
    cook_pts = list(infl_df[infl_df['cooks_d'] > 1].index)
    print('\n***| Cook\'s D |***')
    print('Points where Cook\'s D is > 1: {}'.format(cook_pts))

    # DFFITS
    # NOTE(review): one-sided comparison; large negative DFFITS/DFBETAS values
    # are not flagged. Confirm whether an absolute value was intended.
    DFFITS_cutoff = 2 * np.sqrt(p / n)
    DFFITS_pts = list(infl_df[infl_df['dffits'] > DFFITS_cutoff].index)
    print('\n***| DFFITS |***')
    print('Points which exceed DFFITS cutoff: {}'.format(DFFITS_pts))

    # DFBETAS
    print('\n***| DFBETAS |***')
    DFBETAS_cutoff = 2 / np.sqrt(n)
    DFBETAS_pts = []
    for col in infl_df.columns:
        if 'dfb' in col:
            temp_dfbeta = list(infl_df[infl_df[col] > DFBETAS_cutoff].index)
            DFBETAS_pts.extend(temp_dfbeta)
            print('Points which exceed DFBETAS cutoff for {}: {}'.format(
                col, list(temp_dfbeta)))

    # COVRATIO
    print('\n***| COVRATIO |***')
    COVRATIO_cutoff_pos = 1 + 3 * p / n
    COVRATIO_cutoff_neg = 1 - 3 * p / n
    gt_cutoff = list(
        compress(range(len(infl.cov_ratio)),
                 infl.cov_ratio > COVRATIO_cutoff_pos))
    lt_cutoff = list(
        compress(range(len(infl.cov_ratio)),
                 infl.cov_ratio < COVRATIO_cutoff_neg))
    COVRATIO_pts = gt_cutoff + lt_cutoff
    print(
        'Points which are greater than COVRATIO upper bound cutoff: {}'.format(
            gt_cutoff))
    print('Points which are less than COVRATIO lower bound cutoff: {}'.format(
        lt_cutoff))

    # Most influential points: count how many criteria flagged each point
    # (a point flagged for several DFBETAS columns is counted once per column).
    for i in dhat_pts + cook_pts + DFFITS_pts + DFBETAS_pts + COVRATIO_pts:
        infl_pts[i] = infl_pts.get(i, 0) + 1
    if infl_pts:
        # Compute the maximum once rather than once per candidate point.
        top_count = max(infl_pts.values())
        most_infl = [pt for pt, count in infl_pts.items()
                     if count == top_count]
    else:
        # No criterion flagged any point; max() on an empty dict would raise.
        most_infl = []
    print('\n***| MOST INFLUENTIAL POINTS |***')  #points in every cutoff
    print(sorted(most_infl))

    # Check who these points are
    return X, y, most_infl
def myremove_corr_feats(entity, features, pred_varname, random_varname):
    # Report VIFs for `features`, then drop one feature from every pair whose
    # absolute correlation is >= 0.6, keeping the one more correlated with
    # `pred_varname`. Returns the surviving feature names as a list.
    #
    # entity         -- indexed by lists of column names and exposes
    #                   .values / .corr(); presumably a pandas DataFrame
    # features       -- list of feature column names (copied, not mutated)
    # pred_varname   -- column used to decide which feature of a pair to keep
    # random_varname -- column that is never removed
    #
    # NOTE: Python 2 code (print statements); it also relies on the pandas
    # `Series.order` and `.ix` accessors.
    import copy
    import numpy as np
    import pandas as pd
    from statsmodels.stats import outliers_influence
    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    uncorr_features = copy.copy(features)
    print " before collinearity cleanup:condition number of features matrix = {0:,}".format(int(np.linalg.cond(entity[uncorr_features])))

    # One VIF per feature, computed on the feature-only matrix.
    uncorr_features_vifs = []
    for pos, feat in enumerate(uncorr_features):
        uncorr_features_vifs.append(outliers_influence.variance_inflation_factor(entity[uncorr_features].values, pos))

    # VIF > 10 indicates serious collinearity
    # The VIFs are only printed; removal below is driven by correlations.
    uncorr_features_series = pd.Series(uncorr_features_vifs, index=uncorr_features)
    uncorr_features_series = uncorr_features_series.order(ascending=False)
    print " vifs:"
    #pp.pprint(uncorr_features_series)
    print "{0}".format(uncorr_features_series.to_string(float_format=lambda x: "%0.3f" % x))

    #chk_features = uncorr_features_series[uncorr_features_series >= 10].index.tolist()
    # Correlation matrix over the features plus the random and predicted columns.
    chk_features = copy.copy(uncorr_features)
    if random_varname not in set(chk_features):
        chk_features.append(random_varname)
    chk_features.append(pred_varname)
    features_corr = entity[chk_features].corr()
    #print " features correlation:"
    #pp.pprint(feat_corr)

    remove_features = set([])
    # Walk the lower triangle (feat_j < feat_i), skipping already-removed columns.
    # NOTE(review): pred_varname is itself a row of features_corr, so features
    # correlated >= 0.6 with it are also candidates for removal -- confirm intended.
    for feat_i in range(features_corr.shape[0]):
        if features_corr.columns[feat_i] in remove_features:
            continue

        features_corr_row = features_corr.ix[feat_i]
        for feat_j in range(features_corr.shape[0]):
            if features_corr.columns[feat_j] in remove_features:
                continue

            if feat_i <= feat_j:
                continue

            if abs(features_corr_row[feat_j]) >= 0.6:
                print "\n corr({0},{1}) = {2:0.4f}".format(
                    features_corr.index[feat_i], features_corr.columns[feat_j], features_corr.ix[feat_i, feat_j])
                print " corr({0},{1}) = {2:0.4f}".format(
                    pred_varname, features_corr.columns[feat_i], features_corr.ix[pred_varname, feat_i])
                print " corr({0},{1}) = {2:0.4f}".format(
                    pred_varname, features_corr.columns[feat_j], features_corr.ix[pred_varname, feat_j])

                # Remove whichever of the pair is less correlated with pred_varname
                # (ties remove feat_i).
                if abs(features_corr.ix[pred_varname, feat_i]) > abs(features_corr.ix[pred_varname, feat_j]):
                    remove_features |= set([features_corr.columns[feat_j]])
                else:
                    remove_features |= set([features_corr.columns[feat_i]])

    # The random column is always kept.
    remove_features.discard(random_varname)
    print " removing features:"
    pp.pprint(remove_features)
    for feat in remove_features:
        if feat in set([]): # Override feature removal
            continue

        uncorr_features.remove(feat)
        chk_features.remove(feat)

    features = uncorr_features
    print " uncorrelated feats:"
    pp.pprint(features)
    # check correlations & remove features until all corrs less than threshold ?

    print " after collinearity cleanup:condition number of features matrix = {0:,}".format(int(np.linalg.cond(entity[features])))
    return features
# Stacking classifier with the library's default meta-features.
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()
base_learners = [clf1, clf2, clf3]
sclf = StackingCVClassifier(
    classifiers=base_learners,
    meta_classifier=lr,
    random_state=42,
)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)

# Stacking classifier using probabilities as meta-features.
# The base estimators are re-created so this run starts from unfitted models.
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()
base_learners = [clf1, clf2, clf3]
sclf = StackingCVClassifier(
    classifiers=base_learners,
    meta_classifier=lr,
    use_probas=True,
    random_state=42,
)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)

# VIF
train_matrix = X_train.values
variance_inflation_factor(train_matrix, 0)  # spot check: VIF of one variable

# VIF for all variables, printed in column order
for column_idx, _ in enumerate(X_train.columns):
    print(variance_inflation_factor(train_matrix, column_idx))
'Log_sqftLiving', 'waterfront', 'floors', 'view', 'grade', 'HomeAgeinYear', 'LocationMapping', 'Log_LivingSpaceAvailable', 'Log_NeighbourSpace', 'RenovatedafterYears' ]) #Predicting the Log Price X, y = train_data.loc[:, columnsToTrain], train_data.loc[:, 'Log_price'] #Generating VIF Data #Anything less than with vif < 5 can be considered to have less colinearity vif = pd.DataFrame() New_X = add_constant(X) vif['VIF Factors'] = [ variance_inflation_factor(New_X.values, i) for i in range(New_X.shape[1]) ] vif['Columns'] = New_X.columns print(vif) print('\n') #Splitting the values into training and validation set X_train, X_test, Y_train, Y_test = model_selection.train_test_split( X, y, test_size=0.30) #Polynomial regression since columns like floors, view had polynomial relation with the log of price #Using Degree = 3, because anything greater than 3 leads to overfitting and less than 3 leads to underfitting polynomialVariable = PolynomialFeatures(degree=3) polynomialCurveFitting = polynomialVariable.fit_transform(X_train) polynomialCurveFittingTest = polynomialVariable.fit_transform(X_test) #Generating the test results by taking exponential of log values to get the actual price again
# Chi-squared statistic and p-value for every feature against the target.
chi_result = chi2(X, y)
chival['ChiVal'] = chi_result[0]
chival['p-Val'] = chi_result[1]
print(chival.sort_values(by='ChiVal', ascending=False))

# Remove these columns from the train frame first, then from the test frame.
for frame in (t_train, t_test):
    for column in ('Parch', 'SibSp', 'Familysize', 'Fare'):
        del frame[column]

# Variable multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

feature_matrix = X[X.columns].values
vif = pd.DataFrame({
    'Features': X.columns,
    'vif': [
        variance_inflation_factor(feature_matrix, X.columns.get_loc(name))
        for name in X.columns
    ],
})
print(vif.sort_values(by='vif', ascending=False))

# Author's observation: VIF is high for Age and FareBand, so both are removed
# from the train and test frames.
for column in ('Age', 'FareBand'):
    for frame in (t_train, t_test):
        del frame[column]

# Before we split the data, scale the X data.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)
have a positive relationship with a response, but because they are related to one another, when both are placed in the same linear model, we might see a negative coefficient on one of the x-variables, when it truly should have a positive relationship with the response. ''' def vifs(x): ''' Input x as a DataFrame, calculates the vifs for each variable in the DataFrame. DataFrame should not have response variable. Returns dictionary where key is column name and value is the vif for that column. Requires scipy.stats be imported as scs ''' vifs = [] for index in range(x.shape[1]): vifs.append(round(variance_inflation_factor(x.values, index),2)) return vifs vifs(x_c) '''[35.29, 1.36, 16.33, 15.21, 1.43]''' vifs(x_p) '''[4.59, 2.1, 2.1]''' '''Part 2''' '''Question 1''' ''' dataset name: prestige variables: y_p = prestige['prestige']
# Full model on every training column.
lm = sm.OLS(y_train, x_train).fit()
lm.summary()

# Reduced model 1.
lm1_dropped = ['zn','indus','indus','chas','age']
lm1 = sm.OLS(y_train, x_train.drop(lm1_dropped, axis=1)).fit()
lm1.summary()

# Reduced model 2; this is the model used for prediction below.
lm2_dropped = ['zn','indus','indus','chas','age','nox','crim','rm','rad','tax','lstat','ptratio']
lm2 = sm.OLS(y_train, x_train.drop(lm2_dropped, axis=1)).fit()
lm2.summary()

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of each column that remains after this drop.
# Note that x_train itself is overwritten here.
vif_dropped = ['zn','indus','chas','nox','crim','rm','rad','age','tax','lstat']
x_train = x_train.drop(vif_dropped, axis=1)
[
    variance_inflation_factor(x_train.values, column_idx)
    for column_idx in range(x_train.shape[1])
]

# Prediction
test_dropped = ['zn','indus','chas','nox','crim','rm','rad','age','tax','lstat','ptratio']
pred_test = lm2.predict(x_test.drop(test_dropped, axis=1))
err_test = np.abs(y_test - pred_test)
print(err_test)

# MAPE
import numpy as np


def mean_absolute_percentage_error(y_test, pred_test):
    """Return the mean absolute percentage error of pred_test against y_test.

    Both inputs are converted to numpy arrays; the result is a percentage
    (the mean of |actual - predicted| / |actual|, times 100).
    """
    actual = np.array(y_test)
    predicted = np.array(pred_test)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


mean_absolute_percentage_error(y_test, pred_test)