def vif():
  global final_html
  global df, df_train, df_test, test_train_created, origin_df
  chi_key = list()
  if request.method == 'POST':
    try:
      Listkey1 = list(MultiDict(request.form).values())
      df1 = df
      for key1 in Listkey1:
        if key1 != "Calculate VIF":
          chi_key.append(key1)
      df1 = df.loc[:, chi_key]
      df2 = df1.values
      temp_count = 0
      vif_result = ""
      print("chi key", chi_key)
      for key1 in chi_key:
        k = of.variance_inflation_factor(df2, temp_count)
        vif_result = vif_result + "<br>" + chi_key[temp_count] + ": " + str(k)
        temp_count = temp_count + 1
      temp_df = df[1:15]
      final_html = template.s1 + "<br><b> Variance Inflation Factor for selected variables </b><br>" + vif_result + "<br><br></div>" + temp_df.to_html()
      return final_html
    except ValueError:
      final_html = template.s1 + """<br><font color="lightcoral"> Error. Please select valid values</font> <br><br></div>""" + df[1:15].to_html()
      return final_html
    except KeyError:
      final_html = template.s1 + """<br><font color="lightcoral"> Error. Please upload file and select valid values </font> <br><br></div>""" + df[1:15].to_html()
      return final_html
  return 'helloo'
Example #2
def get_vifs(df):
    X = sm.add_constant(df)
    col_num = X.shape[1]
    df = X.iloc[:, 1:]
    vif_list = [variance_inflation_factor(np.array(X), i) for i in np.arange(1, col_num, 1)]
    result = Series(vif_list, df.columns)
    print("VIF of all columns are: \n", result)
def VIF(typ, Currency, x_ls, exog_id):
    
    reg_df=pd.read_excel(ROOT_DIR  + 'cleaned data/regression data/' + typ +'/' + Currency + '_' + typ +'.xlsx')
    
    for x in x_ls:
        reg_df = reg_df[~reg_df[x].isnull()]
        
    mx = reg_df[x_ls].to_numpy()
    
    return outliers_influence.variance_inflation_factor(mx, exog_id)
def test_collinearity(data, explanatory_variables):
    data = numpy.array(data)
    highly_collinear_attr = list()
    vif_list = list()
    for attr in explanatory_variables:
        vif = outliers_influence.variance_inflation_factor(data, 
                                                           explanatory_variables.index(attr))
        vif_list.append(vif)
        if(vif > 5):
            highly_collinear_attr.append(attr)

    print('\nVariance Inflation Factors:')
    print(pandas.DataFrame(vif_list, index=explanatory_variables, columns=['VIF']).T)

    print('\nHighly collinear features:')
    print(highly_collinear_attr)
def rm_vif(X):

    import statsmodels.stats.outliers_influence as smso
    loop=True
    indep = X.copy()
    # print indep.shape
    while loop:
        vifs = np.array([smso.variance_inflation_factor(indep.values, i) for i in range(indep.shape[1])])
        max_vif = vifs[1:].max()
        # print max_vif, vifs.mean()
        if max_vif > 30 and vifs.mean() > 10:
            where_vif = vifs[1:].argmax() + 1
            keep = np.arange(indep.shape[1]) != where_vif
            nms = indep.columns.values[where_vif]  # only ever length 1
            print((bcolors.FAIL + bcolors.UNDERLINE +
                   "\n%s removed due to multicollinearity.\n" + bcolors.ENDC) % nms)
            indep = indep.iloc[:, keep]
        else:
            loop=False
    # print indep.shape

    return indep
def vif():
  global final_html
  global df, origin_df
  chi_key = list()
  firstkey = ""
  if request.method == 'POST':
    Listkey1 = list(MultiDict(request.form).values())
    Listkey2 = MultiDict(request.form)
    df1 = df
    for key1 in Listkey1:
      if key1 != "Calculate VIF":
        chi_key.append(key1)
    df1 = df.loc[:, chi_key]
    df2 = df1.values
    temp_count = 0
    vif_result = ""
    for key1 in chi_key:
      k = of.variance_inflation_factor(df2, temp_count)
      vif_result = vif_result + "<br>" + chi_key[temp_count] + ": " + str(k)
      temp_count = temp_count + 1
    temp_df = df[1:15]
    final_html = template.s1 + "</div><br> VIF Results for selected variables <br>" + vif_result + "<br>" + temp_df.to_html()
    return final_html
  return 'helloo'
Example #7
ACF_resid=tsa.acf(rlt.resid) # Keep ACF of residuals

"""
If the error term is stationary, a stable relationship between the explanatory variables and the dependent variable in the model is guaranteed (one that returns to equilibrium even if it temporarily drifts away). Also, since many economic variables are non-stationary to begin with, stationary residuals make it less likely that an important factor has been omitted from the model.

Besides serial correlation, the other important check is multicollinearity. It arises when the explanatory variables are strongly correlated with one another, and it is troublesome because it can flip the signs of the estimated coefficients.

Compute the VIF statistics as below; if they are not well above 10, things can be considered acceptable for now. Even without judging mechanically from the VIF, you will usually notice multicollinearity empirically when an explanatory variable appears with the sign opposite to what is expected. That said, about the only remedy for multicollinearity is to drop one of the strongly correlated explanatory variables.

Approaches that constrain the parameter space, such as ridge regression, sacrifice the unbiasedness of the parameters to begin with and are not guaranteed to resolve multicollinearity, so they are not recommended in econometrics.

"""


# Checking multicollinearity by VIF
VIF = pd.DataFrame([oti.variance_inflation_factor(rlt.model.exog, i) for i in range(1, rlt.model.exog.shape[1])],
                   index=rlt.model.exog_names[1:], columns=['VIF'])  # VIF > 10 is a warning sign



"""
■ Visualization tools that help with trial and error

Regression analysis inevitably involves trial and error. In fact, most economic phenomena can hardly be described completely by a linear equation, so it is only by fitting various regression specifications and varying the sample period that you can grasp the full picture of an economic phenomenon.

statsmodels ships with visualization tools that support this kind of trial and error. Let's import the following module.

"""


# /// Graphical Diagnostic Tools /// ---------------------------------
import statsmodels.graphics.regressionplots as regplot
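# A minimal sketch of how these diagnostic plots might be used (illustrative
# addition, assuming `rlt` is the fitted OLS results object used above and
# matplotlib is available):
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 8))
regplot.plot_regress_exog(rlt, exog_idx=1, fig=fig)  # fitted/residual/partial plots for one regressor
plt.show()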
Example #8
Test_X = Test.drop('is_click', axis = 1).copy()
Test_Y = Test['is_click'].copy()


########################
# Multicollinearity check
########################

# Check for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor


cols_to_drop_vif = []
# All columns with vif value > 10 were earmarked to be dropped from analysis
for i in range(Train_X.shape[1]-1):
    temp_vif = variance_inflation_factor(Train_X.values, i) # Pass Train_X.values and i (col_number)
    print(Train_X.columns[i], ": ", temp_vif)
    if(temp_vif>10):
        print('Since vif value is greater than 10 so dropping the column ',Train_X.columns[i])
        cols_to_drop_vif.append(Train_X.columns[i])
        
Train_X.drop(cols_to_drop_vif, axis=1, inplace=True)
Test_X.drop(cols_to_drop_vif, axis=1, inplace=True)



########################
# Feature Scaling
########################
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Example #9
lr.fit(X_train,Y_train)
Y_pred=lr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print(rmse)
print(r2_score(Y_test,Y_pred))

#To dig further, let's try to remove multicollinearity using p-values/VIF
#First, let's try VIF
print("Trying VIF")
from statsmodels.stats.outliers_influence import variance_inflation_factor
ndf=df.drop(['shares'],axis=1)
vif = pd.DataFrame()
vif["features"] = ndf.columns
vif["vif_Factor"] = [variance_inflation_factor(ndf.values, i) for i in range(ndf.shape[1])]
Z=vif[~(vif['vif_Factor']>5)]

#Let's see if removing columns through VIF gives any improvement
X=df[Z['features']]
Y=df[['shares']]

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=.20,random_state=85)

from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,Y_train)
Y_pred=lr.predict(X_test)

from sklearn.metrics import r2_score

#Assessing collinearity using the condition number

model_fitted = sm.ols(formula = 'Edad ~ Abertura_A + Abertura_B + Raiz_A + Raiz_B + Superficie_A +Superficie_B ', data=wild_boar_data).fit() # this is the model

print(model_fitted.summary())  # shows OLS regression output

#Assessing multicollinearity using the variance inflation factor
wb_data = wild_boar_data.to_numpy()  # I removed this
X = wb_data[:,2:]
Y = wb_data[:,0]
zone = wb_data[:,1]
vif_wild_boar = []
for i in range(X.shape[1]):
    vif_wild_boar.append(vif.variance_inflation_factor(X,i))

print("######################################################################")
print("Variance inflation factor")
print(vif_wild_boar)


###############################################################################
#
# Split train and test
#
###############################################################################
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
###############################################################################
#
# PCA and PLSR analysis 
Example #11
# **Logistic regression using statsmodels [Model 1]**

# In[55]:

# Performing logistic regression using stats models
X_train_sm = sm.add_constant(X_train[col])
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial()).fit()
logm1.summary()

# In[56]:

# Calculating the VIF values
VIF = pd.DataFrame()
VIF['Features'] = X_train[col].columns
VIF['VIF'] = [
    variance_inflation_factor(X_train[col].values, i)
    for i in range(X_train[col].shape[1])
]
VIF['VIF'] = round(VIF['VIF'], 2)
VIF = VIF.sort_values(by='VIF', ascending=False)
VIF

# The VIF values for all the columns are less than 5, but a few columns have high p-values. So, based on the high p-value, the column `Had a Phone Conversation` should be dropped.

# In[57]:

# Dropping the column with high p-value
col = col.drop('Had a Phone Conversation')
col

# **Logistic regression using statsmodels [Model 2]**
X_smo, y_smo = smo.fit_sample(X_train, y_train)
insample_smo = pd.DataFrame(X_smo).merge(pd.DataFrame(y_smo),left_index=True,right_index=True)
insample_smo.columns = ['FICO', 'DTI', 'OLTV', 'Units_234','First Time Homebuyer Flag_Y', 'Occupancy_I', 'Occupancy_S',
       'Channel_C', 'Channel_R','Channel_T', 'Property Type_CO', 'Property Type_CP','Property Type_MH', 'Property Type_SF', 'Purpose_C', 'Purpose_N',
       'Number of borrowers_1','Default Status']
insample_smo['Default Status'].value_counts()[1]/insample_smo['Default Status'].value_counts().sum()
insample_smo = insample_smo.sample(frac=1).reset_index(drop=True)
X_train = insample_smo.drop(['Default Status'],axis=1)
y_train = insample_smo.loc[:, 'Default Status']
'''

# Multicollinearity check
vif = pd.DataFrame()
vif["features"] = X_train.columns
vif["VIF Factor"] = [
    variance_inflation_factor(X_train.values, i)
    for i in range(X_train.shape[1])
]

# Build models
'''Logistic Regression'''
# Logistic regression with stats
logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit()
print('ALL', logit_model.summary())

# Logistic regression with sklearn
log = LogisticRegression(random_state=0, solver='lbfgs')
modeloutcome(log, X_train, y_train, X_test, y_test)
searchthreshold(log, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
learningcurve(log, X_train, y_train)
log.coef_
y_test = df_test.pop('Salary')
x_test = df_test

x_test_lm = sm.add_constant(x_test)
x_test_lm.shape

y_pred_lm = lr.predict(x_test_lm)
r2_test = r2_score(y_test, y_pred_lm)

#VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['Features'] = x_train_lm.columns
vif['VIF'] = [
    variance_inflation_factor(x_train_lm.values, i)
    for i in range(x_train_lm.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

x_train_lm_1 = x_train_lm.drop('Interview_Score', axis=1)
lr_1 = sm.OLS(y_train, x_train_lm_1).fit()
lr_1.summary()

x_test_lm_1 = x_test_lm.drop('Interview_Score', axis=1)
y_pred_lm_1 = lr_1.predict(x_test_lm_1)
r2_test_1 = r2_score(y_test, y_pred_lm_1)

x_train_lm_2 = x_train_lm_1.drop('Test_Marks', axis=1)
Example #14
print("intersection punto = ", regr.intercept_)
print("NO2 = ", regr.coef_[0], "Viento_MAX + ", regr.coef_[1], "T_MAX + ",
      regr.intercept_)

## NO2 with Viento_MAX, T_MAX and Lluvia
Viento_T_lluvia = data_set[['Lluvia', 'T_MAX', 'Viento_MAX']]
regr.fit(Viento_T_lluvia, NO2)
print("R cuadrado = ", regr.score(Viento_T_lluvia, NO2))
print("coeff lineal = ", regr.coef_)
print("intersection punto = ", regr.intercept_)
print("NO2 = ", regr.coef_[2], "Viento_MAX + ", regr.coef_[1], "T_MAX + ",
      regr.coef_[0], "Lluvia + ", regr.intercept_)

## Multicollinearity
vif = [
    variance_inflation_factor(data_set.values, i)
    for i in range(data_set.shape[1])
]
for i in range(len(vif)):
    print(vif[i])  # V.I.F. = 1/(1-R^2)

## Polynomial fit to explain NO2
for i in range(1, 5):
    poly = np.polyfit(data_set['NO2'],
                      data_set[['Viento_MAX', 'T_MAX', 'Lluvia']], i)
    print(poly)
# the equations only link one attribute with NO2. We did not manage to find an equation linking NO2 with all three attributes at the same time

## Non-linear multiple regression based on trees
multArbol = DecisionTreeRegressor()
multArbol.fit(Viento_T_lluvia, NO2)
Example #15
import statsmodels.formula.api as smf

from statsmodels.stats.outliers_influence import variance_inflation_factor

lr = sm.OLS(y_train, X_train)

lr_model = lr.fit()

lr_preds = lr_model.predict(X_test)

lr_model.summary()

# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(X_train, i) for i in range(X_train.shape[1])
]
vif["features"] = X_train.columns

#
#
#from sklearn.preprocessing import PolynomialFeatures
#
#
#poly = PolynomialFeatures(degree=2)
#
#X_train = poly.fit_transform(X_train)
#
#y_train = poly.fit_transform(y_train)

from helper import rmse
Example #16
# plt.scatter(df['collected debris'], q_column['q_val'])
# plt.show()

x = df[features]
#ydf_merged = df['qval']
y = q_column['q_val']

df_xy = df.copy()
df_xy['Qval'] = y
df_xy.to_csv('C:/Users/ulusan.a/Desktop/RL_rep/RL/data_files/xNy_INS2_V3.csv')

# features = [3,4]
x = df[features]
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(x.values, i) for i in range(x.shape[1])
]
vif["features"] = x.columns

# sc = StandardScaler()
# x = sc.fit_transform(x)

kf = KFold(n_splits=5, shuffle=True)

sum = 0
model = LinearRegression()

results = []
i = 0
mse = []
for train, test in kf.split(df):
Example #17
            x1_IV = high_IV_sorted[i][1]
            y1_IV = high_IV_sorted[j][1]
            if x1_IV > y1_IV:
                deleted_index.append(j)
            else:
                deleted_index.append(i)

multi_analysis_vars_1 = [
    high_IV_sorted[i][0] + "_WOE" for i in range(cnt_vars)
    if i not in deleted_index
]
'''
Multivariate analysis: VIF
'''
X = np.matrix(trainData[multi_analysis_vars_1])
VIF_list = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
max_VIF = max(VIF_list)
print(max_VIF)
# The largest VIF is 1.32267733123, so at this step we conclude there is no multicollinearity
multi_analysis = multi_analysis_vars_1

#%%
'''
Step 6: logistic regression model.
Requirements:
1. variables are significant
2. coefficients have negative signs
'''
### (1) Feed the variables kept after the multivariate analysis into the LR model (see the sketch below)
y = trainData['y']
X = trainData[multi_analysis]
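# The excerpt ends before the actual fit; a minimal sketch of the step described
# above (assuming statsmodels.api is imported as sm), illustrative only:
logit = sm.Logit(y, sm.add_constant(X)).fit()
print(logit.summary())
print(logit.pvalues < 0.05)   # requirement 1: variables significant
print(logit.params < 0)       # requirement 2: coefficients negative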
Example #18
# print("Info    : ",boston.info())
# print("Shape   : ",boston.shape)
#
# print(boston_data.keys())
# print(boston_data.DESCR)
#
#
#Scaling the data
scalar = sklearn.preprocessing.StandardScaler()
scaled_boston = scalar.fit_transform(boston)
#print(type(scaled_boston))

#Checking for multi-collinearity
vif_1 = pd.DataFrame()
vif_1['VIF'] = [
    variance_inflation_factor(scaled_boston, i)
    for i in range(scaled_boston.shape[1])
]
vif_1['features'] = boston_data.feature_names
#print("VIF : \n",(vif_1))

#Model1
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(scaled_boston,
                                                            target,
                                                            test_size=0.2,
                                                            random_state=22)

lin_reg_1 = LinearRegression()
lin_reg_1.fit(x_train_1, y_train_1)

score_1 = r2_score(lin_reg_1.predict(x_test_1), y_test_1)
Example #19
def variance_inflation_factors(df, labels):
    df = df[labels]
    fill = max([len(name) for name in df.columns])
    for index, name in enumerate(df.columns):
        vif = variance_inflation_factor(df.values, index)
        print("{:{fill}} {:>7.1f}".format(name, vif, fill=fill))
Example #20
print(stats.skew(Chunkiboi.drop(['Company'], axis = 1), nan_policy='omit'))
print(stats.skew(Chunkiboi['Wordcount_Review_log']))
print(stats.skew(Chunkiboi['Useful_log']))





#plots = sns.pairplot(Chunkiboi_dropped)




#VIF
X1 =  sm.tools.add_constant(Chunkiboi.drop(['Company'], axis = 1).dropna())
series = pd.Series([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index = X1.columns)
print(series)



#Regressions
Chunkiboi_dropped = Chunkiboi.dropna()

#Null models
Xnull = Chunkiboi_dropped.drop(['Wordcount_Review_log', 'Useful_log', 'Readability', 'Diagnosticity',
 'MRFreq', 'Wordcount Response', 'Response Speed in Days', 'Company', 'Review Count'], axis=1)
Xnull = sm.add_constant(Xnull)

Ynull1 = Chunkiboi_dropped[['Wordcount_Review_log']]
modelnull1 = sm.OLS(Ynull1, Xnull).fit() ## sm.OLS(output, input)
#predictionsnull1 = modelnull1.predict(Xnull)
Example #21
def one_hot_encoding(dataset, drop=True):
    """This function performs the one-hot encoding of the categorical variables"""
    ocean_prox = dataset['ocean_proximity']
    count = ocean_prox.value_counts()
    print()
    cols = list(count.keys())  # list of distinct categorical values

    for col in cols:
        new_feat = np.zeros(
            len(ocean_prox))  # initialize new column with zeros
        idx = np.where(
            ocean_prox == col)[0]  # find indexes of the current column value
        new_feat[idx] = 1  # set to 1 values corresponding to indexes
        dataset[col] = new_feat  # create the new column in the dataset

    dataset.pop('ocean_proximity')  # removing old column

    if drop:  # if drop flag is set to true
        # removing features with high VIF
        data_copy = dataset.copy()  # create a copy of the dataset
        label = data_copy['median_house_value']  # save labels variable
        data_copy.pop('median_house_value')  # drop labels column

        # dataset standardization
        feat_to_skip = [
            'NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND',
            'median_house_value'
        ]

        for key in data_copy.keys():
            if key in feat_to_skip:  # one of k features and labels don't have to
                continue  # be normalized
            data_copy[key] = standardizer(
                data_copy[key])  # standardization of each feature

        while True:
            vif = pd.DataFrame(
            )  # create the Variance Inflator Factor dataframe
            vif['Feature'] = data_copy.columns
            vif['VIF'] = [
                variance_inflation_factor(data_copy.values, i)
                for i in range(data_copy.shape[1])
            ]

            # print current vif dataframe
            print(vif.round(1))
            print()
            # print(vif.round(1).sort_values(by='VIF', ascending=False).to_latex(index=False))
            # print()

            # sort values from the maximum to the minimum
            vif_sort = vif.sort_values(by='VIF',
                                       ascending=False,
                                       ignore_index=True)
            if vif_sort['VIF'][
                    0] > 5.0:  # if the maximum VIF is greater than 5.0
                max_feat = vif_sort['Feature'][
                    0]  # take the feature with maximum VIF
                data_copy.pop(max_feat)  # and remove it from the dataset
                print(max_feat + ' has been dropped')
            else:
                break  # otherwise, while is concluded

        data_copy['median_house_value'] = label  # restore the labels column
        dataset = dataset[data_copy.keys(
        )]  # take non standardized values in original dataset

    return dataset
Example #22
def compute_regression_csa(x, y, p_in, p_out, contrast, path_model):
    """
    Compute stepwise model and complete linear model of CSA. Save both models, compare and analyse residuals. Apply normalization method from model and compute COV.
    Args:
        x (pandas.DataFrame): Data of predictors
        y (pandas.DataFrame): Data of CSA
        p_in (float): include a predictor if its p-value < p_in for stepwise ** p_in <= p_out
        p_out (float): exclude a predictor if its p-value > p_out for stepwise
        contrast (str): Contrast of the image that CSA value was computed from
        path_model (str): Path of the result folder of the models
    Return:
        COV_step, COV_full
    """
    # Create the results directory for the CSA model of this contrast if it doesn't exist
    path_model_contrast = os.path.join(path_model, contrast)
    if not os.path.exists(path_model_contrast):
        os.mkdir(path_model_contrast)

    # Computes stepwise linear regression with p_value
    logger.info("Stepwise linear regression {}:".format(contrast))
    selected_predictors = compute_stepwise(x, y, p_in, p_out)
    logger.info('For ' + contrast +
                ' selected predictors are : {}'.format(selected_predictors))

    # Generates model with selected predictors from stepwise
    model = generate_linear_model(x, y, selected_predictors)

    # Compute VIF
    X = sm.add_constant(x[selected_predictors])
    vif_data = pd.Series(
        [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
        index=X.columns)
    logger.info('VIF of predictors: \n{}'.format(vif_data))

    # Apply normalization method
    COV_step = apply_normalization(y, x, model.params)
    m1_name = 'stepwise_' + contrast
    # Saves summary of the model and the coefficients of the regression
    save_model(model, m1_name, path_model_contrast, x=x[selected_predictors])

    # Generates linear regression with all predictors
    model_full = generate_linear_model(x, y)
    m2_name = 'fullLin_' + contrast

    # Compute VIF
    X = sm.add_constant(x)
    vif_data_full = pd.Series(
        [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
        index=X.columns)
    logger.info('VIF of predictors: \n{}'.format(vif_data_full))

    # Apply normalization method
    COV_full = apply_normalization(y, x, model_full.params)

    # Saves summary of the model and the coefficients of the regression
    save_model(model_full, m2_name, path_model_contrast, x=x)

    # Compares full and reduced models with F_value, R^2,...
    compared_models = compare_models(model, model_full, m1_name, m2_name)
    logger.info('Comparing models: {}'.format(compared_models))
    compared_models_filename = os.path.join(path_model_contrast,
                                            'compared_models') + '.csv'
    df_to_csv(compared_models, compared_models_filename)  # Saves to .csv

    # Residual analysis
    logger.info('\nAnalysing residuals...')
    analyse_residuals(model,
                      m1_name,
                      data=pd.concat([x, y], axis=1),
                      path=os.path.join(path_model_contrast, 'residuals'))
    analyse_residuals(model_full,
                      m2_name,
                      data=pd.concat([x, y], axis=1),
                      path=os.path.join(path_model_contrast, 'residuals'))

    return COV_step, COV_full
correlation_matrix = data.corr()
print("Correlation Matrix\n", correlation_matrix)
print()

#####################################
## VIF (Variance Inflation Factor)
#
# VIF = 1 / (1 - R^2)
#####################################
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y, x = dmatrices(f"{data.columns[0]} ~ {' + '.join(data.columns[1:])}", data, return_type='dataframe')

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif["features"] = x.columns
print("VIF (Variance Inflation Factor)")
print(vif)
print()


my_vif = []
for i in range(1, len(data.columns)):
    lm = smf.ols(f"{data.columns[i]} ~ {' + '.join(data.columns[1:i].append(data.columns[i+1:]))}", data).fit()
    lm.summary()
    print()
    vif_for_i_th_X = round(1 / (1 - lm.rsquared), 6)
    my_vif.append(vif_for_i_th_X)

vif2 = pd.DataFrame(my_vif, columns=['VIF Factor'])
Example #24
print(model.params)

print(model.bic)

##### Run a Wald chi-square test on each variable in the model
for col in model.params.index:
    result = model.wald_test(col)
    print(str(col) + " wald test: " + str(result.pvalue))

### Check the VIF values
from statsmodels.stats.outliers_influence import variance_inflation_factor

train_X_M = np.matrix(train_all_train[list(model.params.index)])

VIF_list = [
    variance_inflation_factor(train_X_M, i) for i in range(train_X_M.shape[1])
]

### Retrain the model ##
model = smf.Logit(train_all_train['Defaulter'],
                  train_all_train[list(model.params.index)]).fit()

###
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, accuracy_score

## Use the fitted model to predict on the training set
## First separate the dataset into X and Y
train_all_train_X = train_all_train[list(model.params.index)]
train_all_train_Y = train_all_train['Defaulter']
Example #25
# +
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

features_vif = "+".join(X_train_log.columns)

features_target = pd.concat([X_train_log, y_train_log], axis="columns")

y_VIF, X_VIF = dmatrices('log_saleprice ~' + features_vif,
                         features_target,
                         return_type="dataframe")

vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(X_VIF.values, i) for i in range(X_VIF.shape[1])
]
vif["features"] = X_VIF.columns

vif.round(1)

# -

# - There are no features with a VIF factor greater than 5, the general cutoff for multicollinearity "concern." The highest VIF values are 3.1 and 3.2 for adj_ovr_qual and good_ament_ct, respectively.

# ## Model Output as Pickled Object
# - Output of model object to be utilized in prediction

# + pycharm={"is_executing": false}
from joblib import dump
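# The dump call itself is cut off in this excerpt; a minimal sketch, assuming the
# fitted estimator is bound to a name such as `model` (hypothetical) and using an
# illustrative file name:
dump(model, 'saleprice_model.joblib')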
Example #26
print("Test data r squared:", lr.score(X_test_tr, y_test))

#Without log 74 train, 71 test //// With log 79 train, 76 test

##############Some STATS#######################
df["Price"].skew()
y_log = np.log(df["Price"])
print(y_log.skew())

X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()
pd.DataFrame({"coef": results.params, "p-value": round(results.pvalues, 3)})

#Testing for Multicollinearity
variance_inflation_factor(exog=X_incl_const.values, exog_idx=1)

vif = []  #Threshold about 10. Over 10 is problematic
for i in range(X_incl_const.shape[1]):
    vif.append(variance_inflation_factor(exog=X_incl_const.values, exog_idx=i))
print(vif)

org_coef = pd.DataFrame({
    "coef_name": X_incl_const.columns,
    "vif": np.round(vif, 2)
})
print(results.bic)  #-129
print(results.rsquared)  #0.796

#Model complexity: Bayesian Information Criterion
#Reduced model #1 excluding INUS
              inplace=True,
              axis=1)
print(chat_up_.head())

# In[33]:

from statsmodels.tools.tools import add_constant

# In[34]:

chat_up_ = add_constant(chat_up_)

# In[36]:

vif = pd.Series([
    variance_inflation_factor(chat_up_.values, i)
    for i in range(1, chat_up_.shape[1])
],
                index=chat_up_.columns[1:])

tolerance = 1 / vif

# In[37]:

print(vif)

# In[38]:

print(tolerance)

# ### The correlation values also show that there is no multicollinearity problem, so the multicollinearity assumption is satisfied
Example #28
peca3temp3 = peca3temp3.reset_index(drop = True)
peca3 = pd.concat([peca3temp1,peca3temp2,peca3temp3],axis = 0)
abc = peca3.describe()
peca3['Lumberprice'] = peca3['Lumberprice'].map(lambda x: (x - abc['Lumberprice']['mean'])/abc['Lumberprice']['std'])
peca3['new housing'] = peca3['new housing'].map(lambda x: (x - abc['new housing']['mean'])/abc['new housing']['std'])
peca3['HPI'] = peca3['HPI'].map(lambda x: (x - abc['HPI']['mean'])/abc['HPI']['std'])
peca3['Vacant Housing Units for Sale']  = peca3['Vacant Housing Units for Sale'].map(lambda  x: (x - abc['Vacant Housing Units for Sale']['mean'])/abc['Vacant Housing Units for Sale']['std'])
peca3['business climate'] = peca3['business climate'].map(lambda x: (x - abc['business climate']['mean'])/abc['business climate']['std'])
peca3['unemployment rate'] = peca3['unemployment rate'].map(lambda x: (x - abc['unemployment rate']['mean'])/abc['unemployment rate']['std'])
peca3['Rental Vacancy Rate'] = peca3['Rental Vacancy Rate'].map(lambda x: (x - abc['Rental Vacancy Rate']['mean'])/abc['Rental Vacancy Rate']['std'])
peca3['CLTV'] = peca3['CLTV'].map(lambda x: (x - abc['CLTV']['mean'])/abc['CLTV']['std'])
#VIF
pecagroup = peca3.drop(['ID', 'Current Date', 'Current Balance', 'CLDS', 'Original Balance',
       'Original Date', 'NLDS',],axis = 1)
peca1 = pecagroup.to_numpy()
vif = [variance_inflation_factor(peca1, i) for i in range(peca1.shape[1])]
print(vif)

#Logistic Regression for CLDS == 0
peca = peca3.drop(['ID','Current Date','Current Balance','CLDS','Original Balance','Original Date'],axis = 1)
# peca['NLDS'] = peca['NLDS'] - 3
temp = peca.copy()
temp = temp.dropna()
dy = temp['NLDS']
dx = temp.drop(['NLDS'],axis = 1)
dx = temp.drop(['NLDS','Lumberprice','new housing','HPI','unemployment rate','Rental Vacancy Rate','Vacant Housing Units for Sale'],axis = 1)
dx = dx.astype('float32')
meandx = pd.DataFrame(dx.mean()).T
np.corrcoef(dx)
train_X,test_X,train_y,test_y = train_test_split(dx,dy,test_size=0.1,train_size=0.1,stratify=dy.values)
regressor = LogisticRegression(multi_class='multinomial',solver='saga',penalty='l2',max_iter=300)
data_cleaned = data_cleaned.drop(['Price'], axis=1)

# ## Multicollinearity

# In[36]:

data_cleaned.columns.values

# In[37]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

variables = data_cleaned[['Mileage', 'Year', 'EngineV']]
vif = pd.DataFrame()
vif["VIF"] = [
    variance_inflation_factor(variables.values, i)
    for i in range(variables.shape[1])
]
vif["features"] = variables.columns

# In[38]:

vif

# In[39]:

data_no_multicollinearity = data_cleaned.drop(['Year'], axis=1)

# # Create Dummy Variables

# In[40]:
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True,
            ax=ax)

# correlation matrix
corr

##### We can clearly see that some independent variables are highly correlated with each other
#* Let's calculate the VIF and remove those variables in order to reduce the complexity of the model.

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
fvif = df2.iloc[:, 0:13]
X = add_constant(fvif)

vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
pd.DataFrame({'vif': vif[1:]}, index=fvif.columns).T

#### Since atemp has a High VIF score we will remove it from our data set.

cc = [
    'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
    'temp', 'hum', 'windspeed', 'casual', 'registered', 'count'
]
reg = df2.loc[:, cc]

train, test = train_test_split(reg, test_size=0.2)
model2 = sm.OLS(train.iloc[:, 12], train.iloc[:, 0:12]).fit()
model2.summary()

##### Since p-value of variables ["season","weathersit", "yr","mnth"]
modelpredicitons['gre_GpaRankLM'] = modelGreGpaRank.predict(dfadmit)
print(modelpredicitons.head())

#%%
# And let us check the VIF value (watch out for multicollinearity issues)
# Import functions
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Get variables for which to compute VIF and add intercept term
X = dfadmit[['gpa', 'rank']]
X['Intercept'] = 1

# Compute and view VIF
vif = pd.DataFrame()
vif["variables"] = X.columns
vif["VIF"] = [ variance_inflation_factor(X.values, i) for i in range(X.shape[1]) ] # list comprehension

# View results using print
print(vif)


#%% [markdown]
# But rank really should be categorical. 
# 
# # Patsy coding
# 
# * Strings and booleans are automatically coded
# * Numerical → categorical
#   * C() function
#   * level 0 → (0,0,0,...)
#   * level 1 → (1,0,0,...)
Example #32
## Multicollinearity test using VIF (variance inflation factor)

#- VIF detects correlation between predictor variables, i.e. the relationship between them.
#- If two predictor variables are correlated, then we can say multicollinearity is present.
#- Multicollinearity affects regression models, so it should not be present in our variables.
#- So we run this test using VIF.
#- If VIF is between 1 and 5, we say there is no multicollinearity.
#- If VIF > 5, there is multicollinearity and we need to remove it or reconsider the variables.


# lets create dataframe of predictor variables
outcome, predictors = dmatrices('fare_amount ~ distance+passenger_count+date+weekday+month+year+hour',train_cab, return_type='dataframe')
# Let's calculate VIF for each independent variable from the train_cab data
VIF = pd.DataFrame()
VIF["VIF"] = [variance_inflation_factor(predictors.values, i) for i in range(predictors.shape[1])]
VIF["Predictors"] = predictors.columns
VIF

#-We can see that the VIF for all the predictors is within the required range, i.e. from 1-5
#-So we can say that multicollinearity is not present in our independent variables

# SO AFTER PERFORMING VARIOUS TESTS ON OUR DATA FOR FEATURE SELECTION WE HAVE FOLLOWING OBSERVATIONS
# There is no multicollinearity in our data
# We will remove 'date' variable from both train and test data.
# Select all other variables for our ML models

# lets create a copy of our data selected for Machine learning 
train_cab_selected= train_cab.copy()
test_cab_selected= test_cab.copy()
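# The 'date' drop mentioned above is not shown in this excerpt; a minimal sketch,
# assuming train_cab_selected/test_cab_selected are the DataFrames in scope:
train_cab_selected = train_cab_selected.drop('date', axis=1)
test_cab_selected = test_cab_selected.drop('date', axis=1)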
Example #33
        plt.plot(cooks_d, 'o', label="Cook's distance")
        plt.legend(loc='upper left')
        ax2 = fig.add_subplot(3,1,3)
        plt.plot(resid_studentized, 'o', label='studentized_resid')
        plt.plot(dffits, 'o', label='DFFITS')
        leg = plt.legend(loc='lower left', fancybox=True)
        leg.get_frame().set_alpha(0.5) #, fontsize='small')
        ltext = leg.get_texts() # all the text.Text instance in the legend
        plt.setp(ltext, fontsize='small') # the legend text fontsize


    print(oi.reset_ramsey(res, degree=3))

    #note, constant in last column
    for i in range(1):
        print(oi.variance_inflation_factor(res.model.exog, i))

    infl = oi.OLSInfluence(res_ols)
    print(infl.resid_studentized_external)
    print(infl.resid_studentized_internal)
    print(infl.summary_table())
    print(oi.summary_table(res, alpha=0.05)[0])

'''
>>> res.resid
array([  4.28571429,   4.        ,   0.57142857,  -3.64285714,
        -4.71428571,   1.92857143,  10.        ,  -6.35714286,
       -11.        ,  -1.42857143,   1.71428571,   4.64285714])
>>> infl.hat_matrix_diag
array([ 0.10084034,  0.11764706,  0.28571429,  0.20168067,  0.10084034,
        0.16806723,  0.11764706,  0.08403361,  0.11764706,  0.28571429,
Example #34
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                            prepend=True)

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]],prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params


        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
                        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
                        [ 4.37040,  0.208146, 21.00,  2.93e-052,  3.95993, 4.78086], # ***
                        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))


        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 6)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
        assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2 #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 3)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
        assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO



        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2,4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2,4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)



        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''

        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''

        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''

        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''


        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,      0.328787,    13.30,     2.62e-029, 3.72587, 5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378, -1.19300, -0.0349939]]) # **

        result_gretl_g1 = dict(
                    endog_mean = ("Mean dependent var",   3.257395),
                    endog_std = ("S.D. dependent var",   18.73915),
                    ssr = ("Sum squared resid",    22799.68),
                    mse_resid_sqrt = ("S.E. of regression",   10.70380),
                    rsquared = ("R-squared",            0.676978),
                    rsquared_adj = ("Adjusted R-squared",   0.673731),
                    fvalue = ("F(2, 199)",            90.79971),
                    f_pvalue = ("P-value(F)",           9.53e-29),
                    llf = ("Log-likelihood",      -763.9752),
                    aic = ("Akaike criterion",     1533.950),
                    bic = ("Schwarz criterion",    1543.875),
                    hqic = ("Hannan-Quinn",         1537.966),
                    resid_acf1 = ("rho",                 -0.107341),
                    dw = ("Durbin-Watson",        2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier  = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breush_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
        het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]


        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                            converters={0:lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                                converters={0:lambda s: s})

        lev.dtype.names = names

        res = res_ols #for easier copying

        cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac =  sw.se_cov(cov_hac)

        assert_almost_equal(res.params, partable[:,0], 5)
        assert_almost_equal(bse_hac, partable[:,1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
        assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        #f-value is based on cov_hac I guess
        #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO


        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6,5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6,5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
Example #35

# In[12]:


X_scaled


# To treat the multicollinearity issue, the variance inflation factor is used, where features with a VIF of more than 5 are removed

# In[13]:


from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif = pd.DataFrame()
vif["vif"] = [variance_inflation_factor(X_scaled,i) for i in range(X_scaled.shape[1])]
vif["Features"] = X.columns

#let's check the values
vif


# From the above column it can be seen that none of the features exceeds the threshold, thus all features are retained

# ###### Split feature and target columns into Train & Test data 

# In[14]:


x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
Example #36
def run_analysis(df, drop_point=None):
    '''
    Parameters
    ----------
    drop_point : int or list-like, optional
        Point(s) to drop for analysis. The default is None.

    Returns
    -------
    Points to potentially drop, if drop_point == None.

    '''
    #####################################
    # DROP INFLUENTIAL POINTS IF NEEDED #
    #####################################
    if drop_point:
        df = df.reset_index().drop(drop_point).set_index('player_rank')
    y, X = patsy.dmatrices('games_played ~ rushing_attempts + total_yards', df)

    ###########################
    # LINEAR REGRESSION MODEL #
    ##########################
    model = sm.OLS(y, X)
    results = model.fit()
    results.model.data.design_info = X.design_info
    coefs = np.round(results.params, 3)
    print()
    print(results.summary())
    title_print('Model')
    print('y = {} + {} * rushing_attempts + {} * total_yards'.\
          format(coefs[0], coefs[1], coefs[2]))

    ###########################
    # DROP INFLUENTIAL POINTS #
    ###########################
    # If dropping points, only run to here and then exit function
    title_print('Significance')
    if drop_point:
        print('Points dropped: {}'.format(drop_point))
        print('Coefficients: {}'.format(np.round(results.params, 3)))
        print('R-squared: {}'.format(round(results.rsquared, 3)))
        return
    else:
        print('Coefficients: {}'.format(np.round(results.params, 3)))
        print('R-squared: {}'.format(round(results.rsquared, 3)))

    ##############################
    # ANOVA TABLE / SIGNIFICANCE #
    ##############################
    # H0: beta_0 = beta_1 = beta_2
    # H1: beta_j != 0
    # Rushing attempts more significant that total yards
    # Makes sense because more time in league closely associated with
    # more chances to run
    aov_table = sm.stats.anova_lm(results, typ=1)
    title_print('Analysis of Variance table')
    print(aov_table)
    print('\nCalculated F-stat: {}'.format(
        round(f.ppf(0.025, X.shape[1] - 1, X.shape[0]), 3)))
    print('Regression F: {}'.format(round(results.fvalue, 2)))
    print('Regression p: {}'.format(round(results.f_pvalue, 4)))
    print('---> Regression is significant <---')

    ########################
    # CONFIDENCE INTERVALS #
    ########################
    conf_int = np.round(results.conf_int(), 3)
    print()
    title_print('95% Confidence Intervals')
    print('Intercept: {} to {}'.format(conf_int[0][0], conf_int[0][1]))
    print('Rushing Attempts: {} to {}'.format(conf_int[1][0], conf_int[1][1]))
    print('Total yards: {} to {}'.format(conf_int[2][0], conf_int[2][1]))

    #####################
    # MULTICOLLINEARITY #
    #####################
    vif = np.round(
        [variance_inflation_factor(X, i) for i in range(X.shape[1])], 4)
    title_print('Multicollinearity')
    [print('VIF_{}: {}'.format(i, vif[i])) for i, v in enumerate(vif)]

    #############
    # RESIDUALS #
    #############
    # Get residuals and probability for plot
    residuals = results.resid
    Prob = [(i - 1 / 2) / len(y) for i in range(1, len(y) + 1)]

    # Plot residuals vs. fitted values
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(results.fittedvalues, residuals)
    ax.axhline(0)
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    plt.title('Residuals Versus Predicted Response')
    plt.show()

    # Calculate OLS using resid to plot straight line. Get y values from model
    resid_results = sm.OLS(Prob, sm.add_constant(sorted(residuals))).fit()
    X_range = np.linspace(min(residuals), max(residuals), len(residuals))

    # Normality plot
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(sorted(residuals), Prob)
    plt.plot(X_range,
             resid_results.params[0] + resid_results.params[1] * X_range)
    plt.xlabel('Residual')
    plt.ylabel('Probability')
    plt.title('Normal Probability Plot')
    plt.show()
    print('---> Heavy-tailed distribution <---')

    ############
    # OUTLIERS #
    ############
    title_print('Outliers / Influence Points')
    pos_out = (np.argmax(residuals), np.amax(residuals))
    neg_out = (np.argmax(-residuals), -np.amax(-residuals))
    x_out = (np.argmax(results.fittedvalues), np.amax(results.fittedvalues))
    # Visually from residual plot, these 3 points are outliers

    # Influential points
    infl = results.get_influence()
    infl_df = infl.summary_frame()
    print(infl_df.head().to_string())
    print('...continued...')
    infl_pts = {}

    # Leverage Points - Hat Diagonal
    n, p = X.shape[0], X.shape[1] - 1
    lev_pt = 2 * p / n
    dhat_pts = list(infl_df[infl_df['hat_diag'] > lev_pt].index)
    print('\n***| Hat Diagonal |***')
    print('Leverage calculation (2 * p \ n) = {}'.format(round(lev_pt, 3)))
    print('Points where hat diagonal exceeds leverage calculation: {}'.format(
        dhat_pts))

    # Cook's D
    cook_pts = list(infl_df[infl_df['cooks_d'] > 1].index)
    print('\n***| Cook\'s D |***')
    print('Points where Cook\'s D is > 1: {}'.format(cook_pts))

    # DFFITS
    DFFITS_cutoff = 2 * np.sqrt(p / n)
    DFFITS_pts = list(infl_df[infl_df['dffits'] > DFFITS_cutoff].index)
    print('\n***| DFFITS |***')
    print('Points which exceed DFFITS cutoff: {}'.format(DFFITS_pts))

    # DFBETAS
    print('\n***| DFBETAS |***')
    DFBETAS_cutoff = 2 / np.sqrt(n)
    DFBETAS_pts = []
    for col in infl_df.columns:
        if 'dfb' in col:
            temp_dfbeta = list(infl_df[infl_df[col] > DFBETAS_cutoff].index)
            DFBETAS_pts.extend(temp_dfbeta)
            print('Points which exceed DFBETAS cutoff for {}: {}'.format(
                col, list(temp_dfbeta)))

    # COVRATIO
    print('\n***| COVRATIO |***')
    COVRATIO_cutoff_pos = 1 + 3 * p / n
    COVRATIO_cutoff_neg = 1 - 3 * p / n
    gt_cutoff = list(
        compress(range(len(infl.cov_ratio)),
                 infl.cov_ratio > COVRATIO_cutoff_pos))
    lt_cutoff = list(
        compress(range(len(infl.cov_ratio)),
                 infl.cov_ratio < COVRATIO_cutoff_neg))
    COVRATIO_pts = gt_cutoff + lt_cutoff
    print(
        'Points which are greater than COVRATIO upper bound cutoff: {}'.format(
            gt_cutoff))
    print('Points which are less than COVRATIO lower bound cutoff: {}'.format(
        lt_cutoff))

    # Most influential points
    for i in dhat_pts + cook_pts + DFFITS_pts + DFBETAS_pts + COVRATIO_pts:
        infl_pts[i] = infl_pts.get(i, 0) + 1
    most_infl = [
        pt for pt in infl_pts if infl_pts[pt] == max(infl_pts.values())
    ]
    print('\n***| MOST INFLUENTIAL POINTS |***')  #points in every cutoff
    print(sorted(most_infl))

    # Check who these points are
    return X, y, most_infl
def myremove_corr_feats(entity, features, pred_varname, random_varname):

    import copy
    import numpy as np
    import pandas as pd
    from statsmodels.stats import outliers_influence
    import pprint
    pp = pprint.PrettyPrinter(indent=4)


    uncorr_features = copy.copy(features)
    print "		before collinearity cleanup:condition number of features matrix = {0:,}".format(int(np.linalg.cond(entity[uncorr_features])))

    uncorr_features_vifs = []
    for pos, feat in enumerate(uncorr_features):
        uncorr_features_vifs.append(outliers_influence.variance_inflation_factor(entity[uncorr_features].values, pos))

    #	VIF > 10 indicates serious collinearity
    uncorr_features_series = pd.Series(uncorr_features_vifs, index=uncorr_features)
    uncorr_features_series = uncorr_features_series.sort_values(ascending=False)
    print("		vifs:")
    #pp.pprint(uncorr_features_series)
    print("{0}".format(uncorr_features_series.to_string(float_format=lambda x: "%0.3f" % x)))

    #chk_features = uncorr_features_series[uncorr_features_series >= 10].index.tolist()
    chk_features = copy.copy(uncorr_features)
    if random_varname not in set(chk_features):
        chk_features.append(random_varname)

    chk_features.append(pred_varname)

    features_corr = entity[chk_features].corr()
    #print "						features correlation:"
    #pp.pprint(feat_corr)

    remove_features = set([])
    for feat_i in range(features_corr.shape[0]):
        if features_corr.columns[feat_i] in remove_features:
            continue
        features_corr_row = features_corr.iloc[feat_i]
        for feat_j in range(features_corr.shape[0]):
            if features_corr.columns[feat_j] in remove_features:
                continue
            if feat_i <= feat_j:
                continue
            if abs(features_corr_row.iloc[feat_j]) >= 0.6:
                print("\n							corr({0},{1}) = {2:0.4f}".format(
                    features_corr.index[feat_i], features_corr.columns[feat_j], features_corr.iloc[feat_i, feat_j]))
                print("									corr({0},{1}) = {2:0.4f}".format(
                    pred_varname, features_corr.columns[feat_i], features_corr.loc[pred_varname, features_corr.columns[feat_i]]))
                print("									corr({0},{1}) = {2:0.4f}".format(
                    pred_varname, features_corr.columns[feat_j], features_corr.loc[pred_varname, features_corr.columns[feat_j]]))
                if abs(features_corr.loc[pred_varname, features_corr.columns[feat_i]]) > abs(features_corr.loc[pred_varname, features_corr.columns[feat_j]]):
                    remove_features |= set([features_corr.columns[feat_j]])
                else:
                    remove_features |= set([features_corr.columns[feat_i]])

    remove_features.discard(random_varname)
    print "						removing features:"
    pp.pprint(remove_features)

    for feat in remove_features:
        if feat in set([]):	# Override feature removal
            continue

        uncorr_features.remove(feat)
        chk_features.remove(feat)

    features = uncorr_features
    print "		uncorrelated feats:"
    pp.pprint(features)
    # check correlations & remove features until all corrs less than threshold ?
    print "		 after collinearity cleanup:condition number of features matrix = {0:,}".format(int(np.linalg.cond(entity[features])))

    return features
Example #38
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor

clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            random_state=42)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)

#Stacking classifier using probabilities as Meta-Features
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                            meta_classifier=lr,
                            use_probas=True,
                            random_state=42)
sclf.fit(X_train, y_train)
sclf.score(X_test, y_test)

#vif
variance_inflation_factor(X_train.values, 0)  #for testing vif for one variable

#vif for all variables
for i in range(len(X_train.columns)):
    print(variance_inflation_factor(X_train.values, i))
Example #39
                                   'Log_sqftLiving', 'waterfront', 'floors',
                                   'view', 'grade', 'HomeAgeinYear',
                                   'LocationMapping',
                                   'Log_LivingSpaceAvailable',
                                   'Log_NeighbourSpace', 'RenovatedafterYears'
                               ])

#Predicting the Log Price
X, y = train_data.loc[:, columnsToTrain], train_data.loc[:, 'Log_price']

#Generating VIF Data
#Anything with VIF < 5 can be considered to have low collinearity
vif = pd.DataFrame()
New_X = add_constant(X)
vif['VIF Factors'] = [
    variance_inflation_factor(New_X.values, i) for i in range(New_X.shape[1])
]
vif['Columns'] = New_X.columns
print(vif)
print('\n')
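#A possible follow-up sketch (not in the original): iteratively drop the highest-VIF
#column until every remaining VIF is below 5. It reuses pd, add_constant and
#variance_inflation_factor from above; the helper name prune_by_vif is illustrative only.
def prune_by_vif(frame, threshold=5.0):
    #Work on a copy so the original training frame is untouched
    pruned = frame.copy()
    while True:
        design = add_constant(pruned)
        #Skip the constant term (index 0) when scanning VIFs
        vif_values = pd.Series(
            [variance_inflation_factor(design.values, i) for i in range(1, design.shape[1])],
            index=pruned.columns)
        if vif_values.max() < threshold:
            return pruned
        #Drop the single worst offender and re-check
        pruned = pruned.drop(columns=[vif_values.idxmax()])

#Example usage (commented out to keep the original flow unchanged):
#X = prune_by_vif(X)
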
#Splitting the values into training and validation set
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, y, test_size=0.30)

#Polynomial regression since columns like floors, view had polynomial relation with the log of price
#Using Degree = 3, because anything greater than 3 leads to overfitting and less than 3 leads to underfitting
polynomialVariable = PolynomialFeatures(degree=3)
polynomialCurveFitting = polynomialVariable.fit_transform(X_train)
polynomialCurveFittingTest = polynomialVariable.transform(X_test)

#Generating the test results by taking exponential of log values to get the actual price again
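#The exponentiation step described above is cut off in this snippet; a minimal sketch,
#assuming a plain LinearRegression is fit on the polynomial features and that Y_train
#holds Log_price (polyModel and predictedPrice below are illustrative names):
import numpy as np
from sklearn.linear_model import LinearRegression

#Fit on the expanded training features, predict on the expanded test features
polyModel = LinearRegression().fit(polynomialCurveFitting, Y_train)
predictedLogPrice = polyModel.predict(polynomialCurveFittingTest)
#Undo the log transform to recover prices on the original scale
predictedPrice = np.exp(predictedLogPrice)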
Example #40
chival['ChiVal'] = chi2(X, y)[0]
chival['p-Val'] = chi2(X, y)[1]

print(chival.sort_values(by='ChiVal', ascending=False))

del t_train['Parch'], t_train['SibSp'], t_train['Familysize'], t_train['Fare']
del t_test['Parch'], t_test['SibSp'], t_test['Familysize'], t_test['Fare']

#Variable multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame({
    'Features': X.columns,
    'vif': 0,
})
vif['vif'] = [
    variance_inflation_factor(X[X.columns].values, X.columns.get_loc(var))
    for var in X.columns
]
print(vif.sort_values(by='vif', ascending=False))

#VIF is high for Age and FareBand
del t_train['Age'], t_test['Age']
del t_train['FareBand'], t_test['FareBand']

#Before we split the data, let's scale the X data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X = sc.fit_transform(X)
have a positive relationship with a response, but because they are related to one another,
when both are placed in the same linear model, we might see a negative coefficient on
one of the x-variables, when it truly should have a positive relationship with the response.
'''
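# An illustrative simulation (not part of the original exercise): x1 and x2 are
# both positively related to y, but because they are nearly collinear, the
# fitted coefficient on one of them can come out unstable or even negative.
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
x1 = rng.normal(size=500)
x2 = x1 + rng.normal(scale=0.05, size=500)          # x2 is almost a copy of x1
y_sim = 1.0 * x1 + 1.0 * x2 + rng.normal(size=500)  # both true slopes are +1
X_sim = sm.add_constant(np.column_stack([x1, x2]))
print(sm.OLS(y_sim, X_sim).fit().params)            # individual slopes are poorly identified
print([round(variance_inflation_factor(X_sim, i), 1) for i in range(1, X_sim.shape[1])])  # very large VIFs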


def vifs(x):
    '''
    Input x as a DataFrame; calculates the VIF for each variable in the DataFrame.
    The DataFrame should not include the response variable.
    Returns a list of VIFs, one per column, in column order.
    Requires statsmodels' variance_inflation_factor to be imported.
    '''
    vifs = []
    for index in range(x.shape[1]):
        vifs.append(round(variance_inflation_factor(x.values, index), 2))
    return vifs

vifs(x_c)
'''[35.29, 1.36, 16.33, 15.21, 1.43]'''
vifs(x_p)
'''[4.59, 2.1, 2.1]'''


'''Part 2'''
'''Question 1'''
'''
dataset name:
    prestige
variables:
    y_p = prestige['prestige']
Example #42
lm=sm.OLS(y_train , x_train).fit()
lm.summary()   

lm1 = sm.OLS(y_train, x_train.drop(['zn','indus','chas','age'],
                                   axis=1)).fit()
lm1.summary()             
                            

lm2 = sm.OLS(y_train, x_train.drop(['zn','indus','chas','age','nox','crim','rm','rad','tax','lstat','ptratio'],
                                   axis=1)).fit()
lm2.summary()             
             

from statsmodels.stats.outliers_influence import variance_inflation_factor
x_train=x_train.drop(['zn','indus','chas','nox','crim','rm','rad','age','tax','lstat'], axis=1)
[variance_inflation_factor(x_train.values, j) for j in range(x_train.shape[1])]

#Prediction
pred_test=lm2.predict(x_test.drop(['zn','indus','chas','nox','crim','rm','rad','age','tax','lstat','ptratio'],axis=1))
err_test=np.abs(y_test - pred_test)
print(err_test)

#MAPE
import numpy as np

def mean_absolute_percentage_error(y_test, pred_test): 
    y_test, pred_test = np.array(y_test), np.array(pred_test)
    return np.mean(np.abs((y_test - pred_test) / y_test)) * 100

mean_absolute_percentage_error(y_test, pred_test)