Example #1
0
# The largest VIF is 1.32267733123, so at this step we conclude there is no multicollinearity
multi_analysis = multi_analysis_vars_1

#%%
'''
Step 6: Logistic regression model.
Requirements:
1. Variables must be statistically significant
2. Coefficients must have negative signs
'''
### (1) Feed the variables kept after the multivariate analysis into the LR model
y = trainData['y']
X = trainData[multi_analysis]
X['intercept'] = [1] * X.shape[0]

LR = sm.Logit(y, X).fit()
summary = LR.summary()
print(summary)
pvals = LR.pvalues
pvals = pvals.to_dict()
print(pvals)

# Some variables are not significant and need to be removed step by step
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
while (len(varLargeP) > 0 and len(multi_analysis) > 0):
    # In each iteration, remove the least significant variable, until
    # (1) all remaining variables are significant, or
    # (2) no features are left
    varMaxP = varLargeP[0][0]
    print(varMaxP)
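    # (The snippet is cut off above; a minimal sketch of how the elimination step
    # might continue, reusing the names from this example — not the original
    # author's code.)
    multi_analysis.remove(varMaxP)
    if len(multi_analysis) == 0:
        break
    X = trainData[multi_analysis]
    X['intercept'] = [1] * X.shape[0]
    LR = sm.Logit(y, X).fit()
    pvals = LR.pvalues.to_dict()
    varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
    varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)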
Example #2
0
def fkitbutton():
    global df1
    global valid
    val1 = combo1.get()
    dep_var = val1.split(" ")

    # drop high-cardinality categorical columns
    for col in list(df1.select_dtypes(include=['object']).columns):
        if len(df1[col].unique()) >= 10:
            df1.drop(col, inplace=True, axis=1)
            valid.drop(col, inplace=True, axis=1)

    df1 = df1.drop(['probbyx'], axis=1, errors='ignore')

    # drop highly correlated columns
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = df1.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (abs(corr_matrix.iloc[i, j]) >= 0.8) and (corr_matrix.columns[j]
                                                         not in col_corr):
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
                if colname in df1.columns:
                    del df1[colname]  # deleting the column from the dataset
                    del valid[colname]

    scr.insert(tk.INSERT, "Correlation done ")

    iv()

    scr.insert(tk.INSERT, "IV calcs ")

    # keep only variables with IV >= 0.1
    keepcol = iv[(iv.IV >= 0.1)]
    keepcol1 = keepcol['VAR_NAME']
    values = keepcol1.values.tolist()
    xy = val1

    values.insert(0, xy)
    #only delete if exists
    while 'probbyx' in values:
        values.remove('probbyx')

    df1 = df1[values]
    valid = valid[values]
    scr.insert(tk.INSERT, "removedIV")

    # one-hot encoding
    onehotencoding()

    df1 = df1.drop(['probbyx'], axis=1, errors='ignore')
    df1['tagxxc'] = df1[dep_var]

    if (df1.tagxxc.nunique() == 2):
        del df1['tagxxc']
        ay = pd.DataFrame(df1[dep_var])
        #  alun = np.ravel(df1[dep_var])
        ya = list(ay.columns)
        Xa = df1.drop(ya, axis=1)
        Xa.insert(0, 'Intercept', 1)
        # names = list(Xa.columns)
        scr.insert(tk.INSERT, '\n\n')

        #      scr.insert(tk.INSERT, "Logistic Regression between " + val1 + " ~ " + val2 +
        #                " : ")
        scr.insert(tk.INSERT, "Logistic Regression ")
        scr.insert(tk.INSERT, '\n\n')
        logit_model = sm.Logit(ay, Xa)
        result = logit_model.fit()
        df1['probbyx'] = result.predict(Xa)
        valx = valid.drop(ya, axis=1)
        valx.insert(0, 'Intercept', 1)
        valid['probbyx'] = result.predict(valx)

        scr.insert(tk.INSERT, result.summary())
        scr.insert(tk.INSERT, '\n\n')
        scr.insert(tk.INSERT, '\n\n')
        AUC = roc_auc_score(df1[dep_var], df1['probbyx'])
        Gini = (2 * AUC) - 1
        vAUC = roc_auc_score(valid[dep_var], valid['probbyx'])
        vGini = (2 * vAUC) - 1
        fpr, tpr, thresholds = roc_curve(df1[dep_var], df1.probbyx)
        vfpr, vtpr, vthresholds = roc_curve(valid[dep_var], valid.probbyx)
        plt.figure()
        plt.plot(fpr, tpr, label='Build Gini = %0.2f' % Gini)
        plt.plot(vfpr, vtpr, label='Valid Gini = %0.2f' % vGini)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('Log_ROC')
        plt.title("Gini Plot")
        plt.show()
        scr.insert(tk.INSERT, '\n\n')

    else:
        del df1['tagxxc']
        scr.insert(
            tk.INSERT,
            'Alert! Select independent variable(s) or check that the target is binary')
        scr.insert(tk.INSERT, '\n\n')
logreg = LogisticRegression()

rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

cols=['euribor3m', 'job_blue-collar', 'job_housemaid', 'marital_unknown', 'education_illiterate', 'default_no', 'default_unknown',
      'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
      'month_may', 'month_nov', 'month_oct', "poutcome_failure", "poutcome_success"]
X=os_data_X[cols]
y=os_data_y['y']

# Implementing the model
import statsmodels.api as sm
logit_model=sm.Logit(y,X)

result=logit_model.fit()
print(result.summary2())

cols=['euribor3m', 'job_blue-collar', 'job_housemaid', 'marital_unknown', 'education_illiterate',
      'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
      'month_may', 'month_nov', 'month_oct', "poutcome_failure", "poutcome_success"]
X=os_data_X[cols]
y=os_data_y['y']
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

# Regression model fitting
from sklearn.linear_model import LogisticRegression
Example #4
0
data = df[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_2':])
print(data.head())
#    admit  gre   gpa  prestige_2  prestige_3  prestige_4
# 0      0  380  3.61           0           1           0
# 1      1  660  3.67           0           1           0
# 2      1  800  4.00           0           0           0
# 3      1  640  3.19           0           0           1
# 4      0  520  2.93           0           0           1

# manually add the intercept
data['intercept'] = 1.0

train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)

logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()



# cool enough to deserve its own gist
print(result.summary())

# odds ratios only
print(np.exp(result.params))
# gre           1.002267
# gpa           2.234545
# prestige_2    0.508931
# prestige_3    0.261792
Example #5
0
# Print out statistics for each column
summary_statistics = X.describe()

# Splitting the dataset randomly into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

##Importing statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Create an instance of the logit model
logit_model = sm.Logit(y_train, X_train)

#Fit the logit model
result = logit_model.fit(method='bfgs', maxiter=num_iterations)
print(result.summary())

y_predict = result.predict(X_test)

#Calculate model metrics
from sklearn.metrics import confusion_matrix
decision_cutoffs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
print(
    "Calculating sensitivity (true pos rate), specificity/selectivity (true neg rate), false pos rate, false neg rate, and accuracy"
)
for cutoff in decision_cutoffs:
    tn, fp, fn, tp = confusion_matrix(y_test, (y_predict > cutoff) * 1).ravel()
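    # (The snippet stops here; a minimal sketch of the metrics announced in the
    # print statement above — the metric names below are assumptions, not taken
    # from the original code.)
    sensitivity = tp / (tp + fn)       # true positive rate
    specificity = tn / (tn + fp)       # true negative rate
    false_pos_rate = fp / (fp + tn)
    false_neg_rate = fn / (fn + tp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print(cutoff, sensitivity, specificity, false_pos_rate, false_neg_rate, accuracy)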
Example #6
0
os_data_X, os_data_y = os.fit_sample(df_train_x, df_train_y["Interest"])
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])
# we can check the class balance of the oversampled data
print("Length of oversampled data is ", len(os_data_X))
print("Number of non-interesting matches in oversampled data ", len(os_data_y[os_data_y['y'] == 0]))
print("Number of interesting matches ", len(os_data_y[os_data_y['y'] == 1]))
print("Proportion of non-interesting matches in oversampled data is ", len(os_data_y[os_data_y['y'] == 0]) / len(os_data_X))
print("Proportion of interesting matches in oversampled data is ", len(os_data_y[os_data_y['y'] == 1]) / len(os_data_X))

logreg = LogisticRegression(solver="lbfgs", max_iter=9999)

rfe = RFE(logreg, n_features_to_select=5)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)
print((df_train_x.T[rfe.support_]).T)

df_train_x_rfe = df_train_x.T[rfe.support_].T.drop(columns=['HTR'])
df_test_x_rfe = df_test_x.T[rfe.support_].T.drop(columns=['HTR'])

logit_model=sm.Logit(df_train_y["Interest"],df_train_x_rfe)
result=logit_model.fit()
print(result.summary2())

logreg.fit(df_train_x_rfe, df_train_y["Interest"])
y_pred = logreg.predict(df_test_x_rfe)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(df_test_x_rfe, df_test_y["Interest"])))
cm = confusion_matrix(df_test_y["Interest"], y_pred)
print(cm)
#------------------------------------------------------
def analysis(houses: pd.DataFrame) -> None:
    """We remove columns that were replaced by dummy variables
    and others just used for visualization.
    We use RFE to create a model and test how that works out. 
    We then create another model based off the variables were statistically significant.
    """
    """
    #Me just trying to fit the data without any outside influences
    f= f'SELLER_HOUSE ~ SQFT_PER + PRICE + C(LOCATION)'    
    result= smf.logit(formula= str(f), data= houses).fit()
    print(result.summary2())
    y= ['SELLER_HOUSE']
    x= ['SQFT_PER', 'PRICE', 'LOC_699 - Not Defined', 'LOC_AA - Airport Area', 'LOC_CG - Columbus Grove',
       'LOC_CV - Cypress Village', 'LOC_EASTW - Eastwood', 'LOC_EC - El Camino Real', 'LOC_GP - Great Park',
       'LOC_IRSP - Irvine Spectrum', 'LOC_LGA - Laguna Altura', 'LOC_NK - Northpark', 'LOC_NW - Northwood', 
        'LOC_OC - Oak Creek', 'LOC_OH - Orchard Hills', 'LOC_OT - Orangetree', 'LOC_PS - Portola Springs', 
        'LOC_QH - Quail Hill', 'LOC_SH - Shady Canyon', 'LOC_SJ - Rancho San Joaquin', 'LOC_STG - Stonegate', 
        'LOC_Stonegate', 'LOC_TR - Turtle Rock', 'LOC_TRG - Turtle Ridge', 'LOC_UP - University Park',
       'LOC_UT - University Town Center', 'LOC_WB - Woodbridge', 'LOC_WD - Woodbury', 
        'LOC_WI - West Irvine', 'LOC_WN - Walnut (Irvine)', 'LOC_WP - Westpark']
    x_train, x_test, y_train, y_test= train_test_split(houses[x], houses[y], test_size= 0.3, random_state= 500)
    logreg = LogisticRegression()
    logreg.fit(x_train, y_train.values.ravel())
    y_pred= logreg.predict(x_test)
    print('Accuracy of logistic regression classifier on test set:', round(logreg.score(x_test, y_test), 3))
    # This model is really bad
    
    """

    ""
    houses = houses.drop([
        'DAYS_ON_MARKET', 'ADDRESS', 'LOCATION', 'STATUS', 'PROPERTY_TYPE',
        'ZIP_CODE'
    ],
                         axis=1)
    columns = houses.columns.values.tolist()
    y = ['SELLER_HOUSE']
    x = [i for i in columns if i not in y]

    # Over Sampling Using SMOTE
    x_train, _, y_train, _ = train_test_split(houses[x],
                                              houses[y],
                                              test_size=0.3,
                                              random_state=500)
    x_columns = x_train.columns

    os = SMOTE(random_state=0)
    os_x, os_y = os.fit_sample(x_train, y_train)
    os_x = pd.DataFrame(data=os_x, columns=x_columns)
    os_y = pd.DataFrame(data=os_y, columns=y)

    #Recursive Feature Elimination
    logreg = LogisticRegression(max_iter=600)
    rfe = RFE(logreg, n_features_to_select=20)
    rfe = rfe.fit(os_x, os_y.values.ravel())

    lst = [i for count, i in enumerate(x) if rfe.support_[count] == True]
    X = os_x[lst]
    Y = os_y['SELLER_HOUSE']

    #logit_model= sm.Logit(Y, X)
    #result= logit_model.fit()
    #print(result.summary2())    # Model chosen by RFE

    # These are the features with a p-value less than 0.05
    final_x = [
        'BATHS', 'ZIP_92602.0', 'ZIP_92618.0', 'LOC_699 - Not Defined',
        'LOC_TR - Turtle Rock', 'LOC_WD - Woodbury'
    ]
    #final_x= ['ZIP_92602.0', 'LOC_699 - Not Defined', 'LOC_TR - Turtle Rock', 'LOC_WD - Woodbury']
    X2 = os_x[final_x]

    logit_model2 = sm.Logit(Y, X2)
    result2 = logit_model2.fit()
    print(result2.summary2())  # Final Model

    x_train2, x_test2, y_train2, y_test2 = train_test_split(X2,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=500)
    logreg = LogisticRegression()
    logreg.fit(x_train2, y_train2)

    y_pred = logreg.predict(x_test2)
    print('Accuracy of logistic regression classifier on test set:',
          round(logreg.score(x_test2, y_test2), 2))

    conf_matrix = confusion_matrix(y_test2, y_pred)
    print(conf_matrix)
    # So 22+61 correct predictions and 13+44 wrong predictions

    logit_roc_auc = roc_auc_score(y_test2, logreg.predict(x_test2))
    fpr, tpr, _ = roc_curve(y_test2, logreg.predict_proba(x_test2)[:, 1])
    plt.figure()
    plt.plot(fpr,
             tpr,
             label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
    ""
Example #8
0
z_score, p_val_z = sm.stats.proportions_ztest(count, N, alternative='larger')

if p_val_z > 0.05:
    print("Test fails based on hypothesis test")
else:
    print("Test succeeds based on hypothesis test")

## Logistic regression model can also be applied to this question ##

# generate new variables
df2['intercept'] = 1
df2[['ab_page', 'ab_drop']] = pd.get_dummies(df2['landing_page'])
df2.drop('ab_drop', axis=1, inplace=True)

# train model
logistics_model = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
train = logistics_model.fit()
print(train.summary())

# the p-value is larger than 0.05, so it agrees with the result of the hypothesis test

## Will it be related to the countries? #############################

df_country = pd.read_csv('countries.csv')
df3 = df2.merge(df_country, how='outer', on='user_id')  # merge data

# generate variables
df3[['UK', 'US']] = pd.get_dummies(df3.country)

# train the model
logistics_model_with_country = sm.Logit(
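    # (Truncated in the snippet; a plausible completion, assuming the intercept,
    # the page dummy, and the country dummies created above are the regressors.
    # The result name below is illustrative.)
    df3['converted'], df3[['intercept', 'ab_page', 'UK', 'US']])
train_with_country = logistics_model_with_country.fit()
print(train_with_country.summary())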
Example #9
0
categorical_dummies = pd.get_dummies(ml[categorical_features])
continuous_features = ['AGE', 'NUMOCCS', 'MOD_YEAR', 'TRAV_SP', 'DR_HGT', 'DR_WGT', 'PREV_SUS', 'PREV_DWI', 'PREV_SPD']

# drop variables to avoid perfect separation: when one or more explanatory variables perfectly explains variation in the dependent variable
unrelated = ['SEX_Other', 'INJ_SEV_No Apparent Injury', 'INJ_SEV_Other', 'INJ_SEV_Suspected Minor Injury', 'INJ_SEV_Suspected Serious Injury', 'DEFORMED_Other', 'LGT_COND_Other', 'RELJCT2_Other']
reference = ['SEX_Female', 'REST_USE_None Used', 'VSURCOND_Dry', 'DEFORMED_Minor Damage', 'WEATHER_Clear', 'LGT_COND_Daylight', 'RELJCT2_Non-Junction', 'DR_DRINK_No Drinking', 'BODY_SIZE_Small']

ml_recoded = pd.concat([ml[continuous_features], categorical_dummies], axis=1)
ml_recoded = ml_recoded.drop(unrelated, axis=1)
ml_recoded = ml_recoded.drop(reference, axis=1)
ml_recoded = ml_recoded.fillna(ml_recoded.mean())

X = ml_recoded.drop('INJ_SEV_Fatal Injury', axis=1)
X = sm.tools.tools.add_constant(X)
y = ml_recoded['INJ_SEV_Fatal Injury']
logit = sm.Logit(y, X)
result = logit.fit()
print(result.summary())

data = {k: np.exp(v) for k, v in result.params.items()}
odds = pd.DataFrame(list(data.items()), columns=['Variable', 'Odds']).sort_values('Variable')
print(odds)

########################
##     CLASSIFIER     ##
########################

X, y = X.values, y.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=35)
LR = LogisticRegression()
LR = LR.fit(X_train, y_train)
import pandas as pd
import statsmodels.api as sm
import numpy as np
import sys
filename = sys.argv[1]
data = np.loadtxt(filename, delimiter=',')
[nRow, nCol] = data.shape
data_train = data[:, 0:nCol - 2]
labels_train = data[:, nCol - 1]
logit = sm.Logit(labels_train, data_train)
result = logit.fit()
sys.stdout = open("/Users/b.behmardi/weight.txt", "w")
print(result.params)
Example #11
0
import math
loansData = pd.read_csv(
    'https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv'
)
loansData['Interest.Rate'] = loansData['Interest.Rate'].map(
    lambda x: round(float(x.rstrip('%')) / 100, 4))
loansData['Loan.Length'] = loansData['Loan.Length'].map(
    lambda x: int(x.rstrip(' months')))
loansData['FICO.Score'] = loansData['FICO.Range'].map(lambda x: int(x[:3]))
loansData['IR_TF'] = loansData['Interest.Rate'].map(lambda x: 1
                                                    if x > .12 else 0)
intercept = [1] * len(loansData)
loansData['Intercept'] = intercept
ind_vars = ['Intercept', 'Amount.Requested', 'FICO.Score']
df = loansData
logit = sm.Logit(df['IR_TF'], df[ind_vars])
result = logit.fit()
coeff = result.params
print(coeff)


def logistic_function(FicoScore, LoanAmount):
    # standard logistic: P(IR_TF = 1) = 1 / (1 + exp(-(b0 + b_fico*FICO + b_amount*Amount)))
    prob = 1 / (1 + math.exp(
        -(coeff[0] + coeff[2] * FicoScore + coeff[1] * LoanAmount)))
    if prob > 0.7:
        p = 1
    else:
        p = 0
    return prob, p

#Part 2 - Statistical Regression Analysis

df['Flag'] = df['Days Delinquent'] > 90
df['log_annual_income'] = np.log(df['Annual Income'])
df['log_loan_amount'] = np.log(df['Loan Amount'])
df['Loan Type'].unique()
df = pd.get_dummies(df, prefix='LoanType', columns=['Loan Type'])
# Independent variables 'X'
X = df[[
    'Age', 'log_annual_income', 'log_loan_amount', 'LoanType_Auto',
    'LoanType_Business', 'LoanType_Home'
]]
# Dependent variable 'y'
y = df['Flag']
results = sm.Logit(y, X).fit()  # Estimated model summary
results.summary()
results.summary2()
# Based on the logit results, we see that we do not have a sufficient
# model to predict whether or not a customer will be delinquent for over
# 90 days. When looking at the p-values shown in our results.summary2() output,
# we see that none of our independent variables are significant at the 0.05
# alpha level to predict if a customer will be flagged or not.

# Part 3 - Predictive Analytics with Machine Learning
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=2, random_state=1)
cluster_labels = kmeans_model.fit_predict(
    df[['log_annual_income', 'log_loan_amount']])
df['cluster'] = cluster_labels
Example #13
0
linreg = sm.OLS(y, X).fit()

linreg.summary()

print(linreg.summary().tables[1])

X = sm.add_constant(X)

linreg2 = sm.OLS(y, X).fit()

linreg2.summary()

# Let us perform logistic regression with statsmodels

logreg = sm.Logit(y, X[['const', 'balance']]).fit()

logreg.summary()

print(logreg.summary().tables[1])

logreg2 = sm.Logit(y, X).fit()

logreg2.summary()

# Let us perform the regressions with scikit-learn

X = default[['balance', 'income', 'd_student']]

sk_linreg = LinearRegression().fit(X, y)
Example #14
0
# To obtain the parameters
mod.intercept_
mod.coef_

# To view the probabilities
proba = mod.predict_proba(X)
proba[0:5, ]  # the first five probabilities

# To obtain the predictions
y_hat = mod.predict(X)
y_hat[0:5]

# Using statsmodels --------------------------------------------------

# Add a column of ones at the start for the intercept
X_train = sm.add_constant(X)

mod2 = sm.Logit(y, X_train)
result = mod2.fit()

# printing the summary table 
result.summary()

# To view the coefficients
coefficients = result.params
coefficients



Example #15
0
y_pred_proba = logistic.predict_proba(X_LR)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y, y_pred_proba)
auc = metrics.roc_auc_score(y, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.legend(loc=4)

## get summary information for logistic regression

import statsmodels.api as sm

X = pd.DataFrame(data=X_LR, index=None, columns=['A', 'B'])
X['intercept'] = 1.0

Y = finalDf['class'].copy()

logit1 = sm.Logit(Y, X)
#methods: bfgs lbfgs cg ncg minimize
logit1.fit(method='newton').summary()
logit1.fit(method='newton').summary2()
logit1.fit().params

### FIT THE MODEL WITH MACHINE LEARNING (MLP)

#import scipy.io as sio
import tensorflow.keras as kr

nn = [2, 10, 3, 10, 1]
model = kr.Sequential()
#model.add(kr.layers.Dense(nn[1],kernel_initializer='normal', activation='relu',input_dim=nn[0]))
#model.add(kr.layers.Dense(nn[-1],kernel_initializer='normal', activation='sigmoid'))
model.add(
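    # (Truncated in the snippet; a plausible completion mirroring the
    # commented-out layer definitions above.)
    kr.layers.Dense(nn[1], kernel_initializer='normal', activation='relu', input_dim=nn[0]))
model.add(kr.layers.Dense(nn[-1], kernel_initializer='normal', activation='sigmoid'))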
X_train = insample_smo.drop(['Default Status'],axis=1)
y_train = insample_smo.loc[:, 'Default Status']
'''

# Multicollinearity check
vif = pd.DataFrame()
vif["features"] = X_train.columns
vif["VIF Factor"] = [
    variance_inflation_factor(X_train.values, i)
    for i in range(X_train.shape[1])
]

# Build models
'''Logistic Regression'''
# Logistic regression with stats
logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit()
print('ALL', logit_model.summary())

# Logistic regression with sklearn
log = LogisticRegression(random_state=0, solver='lbfgs')
modeloutcome(log, X_train, y_train, X_test, y_test)
searchthreshold(log, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
learningcurve(log, X_train, y_train)
log.coef_

# C_parameter to regularize
C_param = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
for c in C_param:
    log = LogisticRegression(random_state=0, C=c)
    print(c)
    print(modeloutcome(log, X_train, y_train, X_test, y_test))
Example #17
0
"""

import numpy as np
import statsmodels.api as sm

# Load the data from Spector and Mazzeo (1980). Examples follow Greene's
# Econometric Analysis Ch. 21 (5th Edition).
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)

# Linear Probability Model using OLS
lpm_mod = sm.OLS(spector_data.endog, spector_data.exog)
lpm_res = lpm_mod.fit()

# Logit Model
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit()

# Probit Model
probit_mod = sm.Probit(spector_data.endog, spector_data.exog)
probit_res = probit_mod.fit()

# This example is based on Greene Table 21.1 5th Edition
# Linear Model Parameters
print(lpm_res.params)
# Logit Model Parameters
print(logit_res.params)
# Probit Model Parameters
print(probit_res.params)
#.. print("Typo in Greene for Weibull, replaced with logWeibull or Gumbel")
#.. print("(Tentatively) Weibull Model")
Example #18
0
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
plt.rcParams["font.sans-serif"] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
if __name__ == '__main__':
    matplotlib.rcParams['axes.unicode_minus'] = False
    data = pd.read_csv('WoeData.csv')
    Y = data['SeriousDlqin2yrs']
    X = data.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ],
                  axis=1)
    X1 = sm.add_constant(X)
    logit = sm.Logit(Y, X1)
    result = logit.fit()
    print(result.params)
    test = pd.read_csv('TestWoeData.csv')
    Y_test = test['SeriousDlqin2yrs']
    X_test = test.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ],
                       axis=1)
    X3 = sm.add_constant(X_test)
    resu = result.predict(X3)
    fpr, tpr, threshold = roc_curve(Y_test, resu)
    rocauc = auc(fpr, tpr)
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
Example #19
0
print(churn.pivot_table(['total_charges'], index=['churn', 'custserv_calls']))
print(
    churn.pivot_table(['total_charges'],
                      index=['churn'],
                      columns=['custserv_calls']))
print(churn.pivot_table(['total_charges'], index=['custserv_calls'], columns=['churn'], \
      aggfunc='mean', fill_value='NaN', margins=True))

# Fit a logistic regression model
dependent_variable = churn['churn01']
independent_variables = churn[[
    'account_length', 'custserv_calls', 'total_charges'
]]
independent_variables_with_constant = sm.add_constant(independent_variables,
                                                      prepend=True)
logit_model = sm.Logit(dependent_variable,
                       independent_variables_with_constant).fit()
#logit_model = smf.glm(output_variable, input_variables, family=sm.families.Binomial()).fit()
# print(logit_model.summary())
print("\nQuantities you can extract from the result:\n%s" % dir(logit_model))
print("\nCoefficients:\n%s" % logit_model.params)
print("\nCoefficient Std Errors:\n%s" % logit_model.bse)
#logit_marginal_effects = logit_model.get_margeff(method='dydx', at='overall')
#print(logit_marginal_effects.summary())

print(
    "\ninvlogit(-7.2205 + 0.0012*mean(account_length) + 0.4443*mean(custserv_calls) + 0.0729*mean(total_charges))"
)


def inverse_logit(model_formula):
    from math import exp
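    # (The snippet ends here; a minimal sketch of the inverse logit, assuming
    # model_formula is the value of the linear predictor.)
    return exp(model_formula) / (1 + exp(model_formula))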
Example #20
0
# b. The goal is to use **statsmodels** to fit the regression model you specified in **a.** to see if there is a significant difference in conversion depending on which page a user receives. First, however, you need to create a column for the intercept and a dummy-variable column for the page each user received. Add an **intercept** column and an **ab_page** column, which is 1 when an individual receives the **treatment** and 0 for **control**.

# In[32]:

df2[['control', 'ab_page']] = pd.get_dummies(df['group'])
df2.drop('control', axis=1, inplace=True)
df2.head()

#
# c. Use **statsmodels** to import your regression model. Instantiate the model and fit it using the two columns you created in **b.** to predict whether a user converts.

# In[33]:

df2['intercept'] = 1
lm = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
results = lm.fit()
results.summary()

# d. Provide the summary of your model below, and use it as necessary to answer the questions that follow.

# In[34]:

results.params

# e. What is the p-value associated with **ab_page**? Why does it differ from the value you found in **Part II**?<br><br>  **Hint**: What are the null and alternative hypotheses associated with your regression model, and how do they compare to the null and alternative hypotheses in **Part II**?
#
# **In this model the p-value associated with ab_page is 0.190. The hypotheses of the logistic regression are:
# H0: p_new = p_old
# H1: p_new ≠ p_old
# The null and alternative hypotheses of the logistic regression differ from those in Part II: here the null hypothesis is that the old and new pages have the same conversion rate, i.e. the explanatory variable ab_page has no effect on the response variable converted, while the alternative is that the two conversion rates differ. The p-value computed earlier was for a one-sided test, whereas this one is two-sided, so the values differ; the p-value here measures how strongly ab_page affects the conversion rate, and the smaller it is, the more significant the difference.**
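# A small illustration (not from the original notebook): the regression p-value is
# two-sided; for a symmetric z-statistic, the corresponding one-sided p-value in the
# direction of the estimated effect is half of it.
p_two_sided = results.pvalues['ab_page']
p_one_sided_same_direction = p_two_sided / 2
print(p_two_sided, p_one_sided_same_direction)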
Example #21
0
def logit_object():
    spector_data = sm.datasets.spector.load_pandas()
    spector_data.exog = sm.add_constant(spector_data.exog)
    logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
    return logit_mod
Example #22
0
    rfe.ranking_

    # columns identified by Recursive Feature Elimination
    idc_rfe = pd.DataFrame({
        "rfe_support": rfe.support_,
        "columns": [i for i in X_train.columns],
        "ranking": rfe.ranking_,
    })

    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()

    cols.extend(features.FlightPair.values.tolist())
    cols = list(set(cols))

    logit = sm.Logit(y[type], X[type].loc[:, cols])
    flogit = logit.fit()
    print(flogit.summary())

    coefficients = flogit.summary2().tables[1]
    coefficients = coefficients[coefficients['P>|z|'] < 0.1]
    coefficients['Odds Ratio'] = np.exp(coefficients['Coef.'])
    coefficients['O.R.LB'] = np.exp(coefficients['[0.025'])
    coefficients['O.R.UB'] = np.exp(coefficients['0.975]'])
    coefficients['Probability'] = coefficients['Odds Ratio'].round(
        1) * 0.5  #- 0.5
    coefficients['Probability'] = coefficients['Probability'].mask(
        coefficients['Probability'] >= 1, 0.99)
    coefficients.join(flight_durations)
    coefficients.to_csv('StatSigFlights' + month + '.csv', mode='a')
Example #23
0

# n. What do the z-score and p-value you computed in the previous question mean for 
    #the conversion rates of the old and new pages?  Do they agree with the findings in parts **j.** and **k.**?
# >Since the z-score of 0.0949 does not exceed the critical value of 1.959963984540054, 
    #we keep the null hypothesis that the difference between the two proportions is no different
    #from zero. Since they are not different, we may decide to let the experiment run longer
# <a id='regression'></a>
# ### Part III - A regression approach
# 
# `1.` In this final part, you will see that the result you achieved in the previous A/B test can also be achieved by
    #performing regression.<br><br>
# 
# a. Since each row is either a conversion or no conversion, what type of regression should you be performing in this case?
    # >Logistic Regression

# b. The goal is to use **statsmodels** to fit the regression model you specified in part **a.** to see if there is a significant difference in conversion based on which page a customer receives.  However, you first need to create a column for the intercept, and create a dummy variable column for which page each user received.  Add an **intercept** column, as well as an **ab_page** column, which is 1 when an individual receives the **treatment** and 0 if **control**.

# In[40]:

df2['intercept'] = 1
df2['ab_page'] = np.where(df2['group'] =='control',0,1)
    
# c. Use **statsmodels** to import your regression model.  Instantiate the model, and fit the model using the two columns you created in part **b.** to predict whether or not an individual converts.

# In[41]:

lm = sm.Logit(df2['converted'],df2[['intercept','ab_page']])
r = lm.fit()

Example #24
0
import seaborn as sb

enc = OneHotEncoder(handle_unknown='ignore')
colums = ["Productivity", "day", "day_of_the_week", "time_of_the_day"]#'timestamp', "day_of_the_week", "time_of_the_day",
          #"timestamp_local"]
with open("x_input", 'rb') as f:
    x_input = pickle.load(f)
all = x_input.copy()
y_output = [[i.pop(0)] for i in x_input]
#print
x_input = np.array(x_input)
y_output = np.array(y_output)

label = LabelEncoder()
x_input[:, 2] = label.fit_transform(x_input[:, 2])
x_input[:, 3] = label.fit_transform(x_input[:, 3])
ohe = OneHotEncoder(categorical_features = [2, 3], sparse=False)
x_input = ohe.fit_transform(x_input)
x_input = pd.DataFrame(data = x_input, index = range(len(x_input)))
y_output = pd.DataFrame(data = y_output, index = range(len(y_output)))
sb.heatmap(x_input.corr())

logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=5)
rfe = rfe.fit(x_input, y_output )
print(rfe.support_)
print(rfe.ranking_)

logit_model=sm.Logit(y_output, x_input)
result=logit_model.fit()
print(result.summary())
# b. The goal is to use **statsmodels** to fit the regression model you specified in part **a.** to see if there is a significant difference in conversion based on which page a customer receives. However, you first need to create in df2 a column for the intercept, and create a dummy variable column for which page each user received.  Add an **intercept** column, as well as an **ab_page** column, which is 1 when an individual receives the **treatment** and 0 if **control**.

# In[36]:

#Creating dummy variable columns for landing page
df2['intercept'] = 1
df2[['ab_page', 'old_page']] = pd.get_dummies(df2['landing_page'])
df2.head()

# c. Use **statsmodels** to instantiate your regression model on the two columns you created in part b., then fit the model using the two columns you created in part **b.** to predict whether or not an individual converts.

# In[37]:

#Using Logit function for Logistic regression
log_mod = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
result = log_mod.fit()

# d. Provide the summary of your model below, and use it as necessary to answer the following questions.

# In[38]:

# result.summary2() is used in place of result.summary() to avoid AttributeError: module 'scipy.stats' has no attribute 'chisqprob'
result.summary2()

# In[39]:

#Converting to proportional value
1 / np.exp(result.params)
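# (Note, not from the original notebook: np.exp(coef) gives the odds ratio for a
# one-unit increase in the regressor; taking the reciprocal, 1/np.exp(coef), flips
# the comparison, e.g. expressing the odds of the baseline group relative to the
# group coded 1.)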

# e. What is the p-value associated with **ab_page**? Why does it differ from the value you found in **Part II**?<br><br>  **Hint**: What are the null and alternative hypotheses associated with your regression model, and how do they compare to the null and alternative hypotheses in **Part II**?
Example #26
0
               c='g',
               label='Unfavourable')
    ax.legend(loc='upper right')
    ax.set_xlabel("Plt peak")
    ax.set_ylabel("Plt trough")
    fig.savefig(os.path.join(outdir, "plt_peak_vs_trough.png"), dpi=200)

    # build a simple logistic regression model

    # add a constant and fit
    result, ci = logistic_regression(X, outcomes == 1)
    print(result.summary())

    # plot the decision boundary for model with Plt peak and Plt trough (only)
    plt_dat = X.loc[:, ['Plt peak', 'Plt trough']]
    logit_model_plt = sm.Logit(outcomes == 1, sm.add_constant(plt_dat))
    result_plt = logit_model_plt.fit()
    coeff = result_plt.params
    intercept = -coeff['const'] / coeff['Plt peak']
    slope = -coeff['Plt trough'] / coeff['Plt peak']
    fit_x = np.linspace(peaks_dat.loc[:, 'Plt trough'].min(),
                        peaks_dat.loc[:, 'Plt trough'].max(), 20)
    fit_y = intercept + slope * fit_x

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(peaks_dat.loc[outcomes == 2, 'Plt trough'],
               peaks_dat.loc[outcomes == 2, 'Plt peak'],
               c='b',
               label='Favourable')
    ax.scatter(peaks_dat.loc[outcomes == 1, 'Plt trough'],
Example #27
0
        2 * np.dot(x.T, x))
    print(numdiff.approx_hess(xk, fun2, 1e-3, (y, x))[0] - 2 * np.dot(x.T, x))

    gt = (-x * 2 * (y - np.dot(x, [1, 2, 3]))[:, None])
    g = approx_fprime_cs((1, 2, 3), fun1, (y, x),
                         h=1.0e-20)  #.T   #this shouldn't be transposed
    gd = numdiff.approx_fprime((1, 2, 3), fun1, epsilon, (y, x))
    print(maxabs(g, gt))
    print(maxabs(gd, gt))

    import statsmodels.api as sm

    data = sm.datasets.spector.load(as_pandas=False)
    data.exog = sm.add_constant(data.exog, prepend=False)
    #mod = sm.Probit(data.endog, data.exog)
    mod = sm.Logit(data.endog, data.exog)
    #res = mod.fit(method="newton")
    test_params = [1, 0.25, 1.4, -7]
    loglike = mod.loglike
    score = mod.score
    hess = mod.hessian

    #cs doesn't work for Probit because special.ndtr doesn't support complex
    #maybe calculating ndtr for real and imag parts separately, if we need it
    #and if it still works in this case
    print('sm', score(test_params))
    print('fd', numdiff.approx_fprime(test_params, loglike, epsilon))
    print('cs', numdiff.approx_fprime_cs(test_params, loglike))
    print('sm', hess(test_params))
    print('fd', numdiff.approx_fprime(test_params, score, epsilon))
    print('cs', numdiff.approx_fprime_cs(test_params, score))
Example #28
0
def annotation_lv_logistic_regression(anno, S_U_binary, af_bins):
	multiplier = 1
	permute = False
	# Get indices where annotation is observed
	observed_indices = np.isnan(anno) == False

	observed_anno = anno[observed_indices]
	observed_S_U_binary = S_U_binary[observed_indices]
	observed_af_bins = af_bins[observed_indices]
	if permute == True:
		num_ones = np.sum(observed_S_U_binary)
		pp = num_ones/len(observed_S_U_binary)
		observed_S_U_binary = np.random.binomial(1, pp, size=len(observed_S_U_binary))

	if np.var(observed_anno) == 0:
		test_info = {'beta': np.nan, 'beta_ub': np.nan, 'beta_lb': np.nan, 'pvalue': np.nan}
		return test_info

	observed_anno = (observed_anno - np.mean(observed_anno))/np.std(observed_anno)

	ys = []
	gs = []
	loaded_snps = np.where(observed_S_U_binary == 1.0)[0]
	ys.append([1]*len(loaded_snps))
	gs.append(observed_anno[loaded_snps])

	bin_counts = {}
	for snp_index in loaded_snps:
		snp_af_bin = observed_af_bins[snp_index]
		if snp_af_bin not in bin_counts:
			bin_counts[snp_af_bin] = 0
		bin_counts[snp_af_bin] = bin_counts[snp_af_bin] + 1

	for bin_num in bin_counts.keys():
		num_in_bin = bin_counts[bin_num]*multiplier
		null_indices = np.where((observed_S_U_binary != 1.0) & (observed_af_bins == bin_num))[0]
		if len(null_indices) < num_in_bin:
			print('assumption error')
			pdb.set_trace()
		randomly_sampled_null_indices = np.random.choice(null_indices, size=num_in_bin,replace=False)
		ys.append([0]*len(randomly_sampled_null_indices))
		gs.append(observed_anno[randomly_sampled_null_indices])
	ys = np.hstack(ys)
	gs = np.hstack(gs)
	# Standardize annotations
	# gs = (gs - np.mean(gs))/np.std(gs)
	try:
		model = sm.Logit(ys, sm.add_constant(gs))
		res = model.fit()
		ci = res.conf_int()[1,:]
		beta_lb = ci[0]
		beta_ub = ci[1]
		beta = res.params[1]
		pvalue = res.pvalues[1]
		if res.mle_retvals['converged'] == False:
			beta_lb = np.nan
			beta_ub = np.nan
			beta = np.nan
			pvalue = np.nan
	except:
		pvalue = np.nan
		beta_lb = np.nan
		beta_ub = np.nan
		beta = np.nan
	test_info = {'beta': beta, 'beta_ub': beta_ub, 'beta_lb': beta_lb, 'pvalue': pvalue}
	return test_info
    x_pred = clr.predict(X)
    R2 = 1 - ((x_pred - x0)**2).sum() / ((x0 - x0.mean())**2).sum()
    vif = 1 / (1 - R2)
    if vif > 10:
        print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i],
                                                       vif))

#########################
# Step 5: Apply the logistic regression model #
#########################
multi_analysis = [i + '_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[multi_analysis].copy()
X['intercept'] = [1] * X.shape[0]

LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables are found to be insignificant, so their significance needs to be checked individually
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
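    # (The snippet is cut off here; a plausible loop body, reporting each
    # single-variable p-value.)
    print('{0}: single-variable p-value = {1:.4f}'.format(k, v))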
def perform_logit(df, dv, ivs):
    logit = sm.Logit(df[dv], df[ivs])
    result = logit.fit()
    return result