# 最大的VIF是1.32267733123,因此这一步认为没有多重共线性 multi_analysis = multi_analysis_vars_1 #%% ''' 第六步:逻辑回归模型。 要求: 1,变量显著 2,符号为负 ''' ### (1)将多变量分析的后变量带入LR模型中 y = trainData['y'] X = trainData[multi_analysis] X['intercept'] = [1] * X.shape[0] LR = sm.Logit(y, X).fit() summary = LR.summary() print(summary) pvals = LR.pvalues pvals = pvals.to_dict() print(pvals) # 有些变量不显著,需要逐步剔除 varLargeP = {k: v for k, v in pvals.items() if v >= 0.1} varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True) while (len(varLargeP) > 0 and len(multi_analysis) > 0): # 每次迭代中,剔除最不显著的变量,直到 # (1) 剩余所有变量均显著 # (2) 没有特征可选 varMaxP = varLargeP[0][0] print(varMaxP)
def fkitbutton():
    """Run the feature-screening + logistic-regression pipeline for the GUI.

    Reads the dependent variable name from ``combo1``, then works on the
    module-level ``df1`` (build sample) and ``valid`` (validation sample):

    1. drops object columns with 10 or more distinct values,
    2. drops one column out of every pair whose absolute correlation is
       >= 0.8,
    3. keeps only variables whose information value (``IV``) is >= 0.1,
    4. calls ``onehotencoding()``,
    5. if the target has exactly two distinct values, fits ``sm.Logit``,
       stores the predicted probabilities in a ``probbyx`` column of both
       frames, writes the model summary into ``scr`` and plots / saves the
       ROC curves with the build and validation Gini.

    If the target is not binary an alert is written to ``scr`` instead.

    Side effects: rebinds/mutates the globals ``df1`` and ``valid``, writes
    to the ``scr`` text widget, saves the figure ``Log_ROC`` and opens a
    matplotlib window.
    """
    global df1
    global valid

    val1 = combo1.get()
    dep_var = val1.split(" ")

    # Drop high-cardinality categorical columns from both samples.
    for col in list(df1.select_dtypes(include=['object']).columns):
        if len(df1[col].unique()) >= 10:
            df1.drop(col, inplace=True, axis=1)
            valid.drop(col, inplace=True, axis=1)

    # A previous run may have left its prediction column behind.
    df1 = df1.drop(['probbyx'], axis=1, errors='ignore')

    # Drop correlated columns: for every pair (i, j) with j < i and
    # |corr| >= 0.8, column i is removed unless column j is already gone.
    col_corr = set()  # names of the columns removed so far
    # NOTE(review): on pandas >= 2.0 `corr()` raises if non-numeric columns
    # are still present; confirm the remaining columns are numeric here.
    corr_matrix = df1.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (abs(corr_matrix.iloc[i, j]) >= 0.8) and (
                    corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
                if colname in df1.columns:
                    del df1[colname]
                    del valid[colname]
    scr.insert(tk.INSERT, "Correlation done ")

    # NOTE(review): `iv` is called and then indexed like a DataFrame;
    # presumably the call rebinds the module-level name - verify.
    iv()
    scr.insert(tk.INSERT, "IV calcs ")

    # Keep only variables with enough information value, plus the target.
    keepcol = iv[(iv.IV >= 0.1)]
    keepcol1 = keepcol['VAR_NAME']
    values = keepcol1.values.tolist()
    values.insert(0, val1)
    # Only delete the prediction column if it is listed.
    while 'probbyx' in values:
        values.remove('probbyx')
    df1 = df1[values]
    valid = valid[values]
    scr.insert(tk.INSERT, "removedIV")

    # One-hot encoding (performed by a helper defined elsewhere).
    onehotencoding()
    df1 = df1.drop(['probbyx'], axis=1, errors='ignore')

    # Use a temporary column to count the distinct target values.
    df1['tagxxc'] = df1[dep_var]
    target_is_binary = df1.tagxxc.nunique() == 2
    del df1['tagxxc']

    if not target_is_binary:
        scr.insert(
            tk.INSERT,
            'Alert!, Select Independent Variable(s) or check target is binary')
        scr.insert(tk.INSERT, '\n\n')
        return

    ay = pd.DataFrame(df1[dep_var])
    ya = list(ay.columns)
    # `axis` is passed by keyword: the positional form `drop(ya, 1)` is not
    # accepted by pandas 2.x.
    Xa = df1.drop(ya, axis=1)
    Xa.insert(0, 'Intercept', 1)

    scr.insert(tk.INSERT, '\n\n')
    scr.insert(tk.INSERT, "Logistic Regression ")
    scr.insert(tk.INSERT, '\n\n')

    logit_model = sm.Logit(ay, Xa)
    result = logit_model.fit()
    df1['probbyx'] = result.predict(Xa)

    valx = valid.drop(ya, axis=1)
    valx.insert(0, 'Intercept', 1)
    valid['probbyx'] = result.predict(valx)

    scr.insert(tk.INSERT, result.summary())
    scr.insert(tk.INSERT, '\n\n')
    scr.insert(tk.INSERT, '\n\n')

    # Gini = 2 * AUC - 1, for the build and the validation sample.
    AUC = roc_auc_score(df1[dep_var], df1['probbyx'])
    Gini = (2 * AUC) - 1
    vAUC = roc_auc_score(valid[dep_var], valid['probbyx'])
    vGini = (2 * vAUC) - 1

    fpr, tpr, thresholds = roc_curve(df1[dep_var], df1.probbyx)
    vfpr, vtpr, vthresholds = roc_curve(valid[dep_var], valid.probbyx)

    plt.figure()
    plt.plot(fpr, tpr, label='Build Gini = %0.2f' % Gini)
    plt.plot(vfpr, vtpr, label='Valid Gini = %0.2f' % vGini)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    # The figure is saved before the title is changed for the on-screen plot.
    plt.savefig('Log_ROC')
    plt.title("Gini Plot")
    plt.show()
    scr.insert(tk.INSERT, '\n\n')
# Rank the over-sampled features with recursive feature elimination,
# keeping the 20 best.  `n_features_to_select` is passed by keyword because
# scikit-learn >= 1.0 no longer accepts it positionally.
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# Implementing the model
import statsmodels.api as sm

# Feature sets fitted in turn: the first is the 20-column list, the second
# is the same list without 'default_no', 'default_unknown',
# 'contact_cellular' and 'contact_telephone'.
feature_sets = [
    ['euribor3m', 'job_blue-collar', 'job_housemaid', 'marital_unknown',
     'education_illiterate', 'default_no', 'default_unknown',
     'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
     'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
     'month_nov', 'month_oct', "poutcome_failure", "poutcome_success"],
    ['euribor3m', 'job_blue-collar', 'job_housemaid', 'marital_unknown',
     'education_illiterate', 'month_apr', 'month_aug', 'month_dec',
     'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
     'month_oct', "poutcome_failure", "poutcome_success"],
]
for cols in feature_sets:
    # `cols`, `X`, `y`, `logit_model` and `result` end up bound to the
    # values of the last (reduced) fit, exactly as before.
    X = os_data_X[cols]
    y = os_data_y['y']
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()
    print(result.summary2())

# Regression model fitting
from sklearn.linear_model import LogisticRegression
# Join the kept columns with every prestige dummy from 'prestige_2' onwards.
# `.loc` performs the label-based column slice; the `.ix` indexer used here
# previously was removed in pandas 1.0.
data = df[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_2':])
print(data.head())
#    admit  gre   gpa  prestige_2  prestige_3  prestige_4
# 0      0  380  3.61           0           1           0
# 1      1  660  3.67           0           1           0
# 2      1  800  4.00           0           0           0
# 3      1  640  3.19           0           0           1
# 4      0  520  2.93           0           0           1

# manually add the intercept
data['intercept'] = 1.0

# Every column except the first one (the 'admit' target in the preview
# above); this includes the intercept column that was just added.
train_cols = data.columns[1:]

logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()

# cool enough to deserve its own gist
print(result.summary())

# odds ratios only
print(np.exp(result.params))
# gre 1.002267
# gpa 2.234545
# prestige_2 0.508931
# prestige_3 0.261792
# Print out statistics for each column
summary_statistics = X.describe()

# Split the dataset randomly into a training set and a test set.
# `train_test_split` lives in `sklearn.model_selection`; the old
# `sklearn.cross_validation` module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Importing statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Create an instance of the logit model
logit_model = sm.Logit(y_train, X_train)

# Fit the logit model (`num_iterations` is defined outside this excerpt)
result = logit_model.fit(method='bfgs', maxiter=num_iterations)
print(result.summary())
y_predict = result.predict(X_test)

# Calculate model metrics
from sklearn.metrics import confusion_matrix
decision_cutoffs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
print(
    "Calculating sensitivity (true pos rate), specificity/selectivity (true neg rate), false pos rate, false neg rate, and accuracy"
)
for cutoff in decision_cutoffs:
    # Threshold the predicted probabilities at `cutoff` and unpack the
    # 2x2 confusion matrix in scikit-learn's (tn, fp, fn, tp) order.
    tn, fp, fn, tp = confusion_matrix(y_test,
                                      (y_predict > cutoff) * 1).ravel()
# Over-sample the training data.
# NOTE(review): `os` is the sampler object created outside this excerpt
# (it shadows the stdlib module name) - presumably an imbalanced-learn
# sampler; confirm.  `fit_sample` was removed from imbalanced-learn, so the
# equivalent `fit_resample` is used.
os_data_X, os_data_y = os.fit_resample(df_train_x, df_train_y["Interest"])
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
# Build the target frame from the raw values: if the sampler hands back a
# pandas Series named "Interest", passing it directly with columns=['y']
# would produce a column of NaN.
os_data_y = pd.DataFrame(data=pd.Series(os_data_y).values, columns=['y'])

# Check the size and class balance of the over-sampled data
print("length of oversampled data is ", len(os_data_X))
print("Number of none interesting matches in oversampled data",
      len(os_data_y[os_data_y['y'] == 0]))
print("Number of interesting matches", len(os_data_y[os_data_y['y'] == 1]))
print("Proportion of no subscription data in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 0]) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 1]) / len(os_data_X))

# Recursive feature elimination down to 5 features.
# `n_features_to_select` must be passed by keyword on scikit-learn >= 1.0.
logreg = LogisticRegression(solver="lbfgs", max_iter=9999)
rfe = RFE(logreg, n_features_to_select=5)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# Keep the RFE-selected columns (selected by masking the transposed frame),
# then drop 'HTR' from the selection.
print((df_train_x.T[rfe.support_]).T)
df_train_x_rfe = df_train_x.T[rfe.support_].T.drop(columns=['HTR'])
df_test_x_rfe = df_test_x.T[rfe.support_].T.drop(columns=['HTR'])

# statsmodels fit on the original (not over-sampled) training rows
logit_model = sm.Logit(df_train_y["Interest"], df_train_x_rfe)
result = logit_model.fit()
print(result.summary2())

# scikit-learn fit and evaluation on the held-out test set
logreg.fit(df_train_x_rfe, df_train_y["Interest"])
y_pred = logreg.predict(df_test_x_rfe)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(df_test_x_rfe, df_test_y["Interest"])))
# NOTE(review): this rebinds the name `confusion_matrix` from the function
# to its result, so the function cannot be called again afterwards.  The
# name is kept because later code outside this excerpt may rely on it.
confusion_matrix = confusion_matrix(df_test_y["Interest"], y_pred)
print(confusion_matrix)
#------------------------------------------------------
def analysis(houses: pd.DataFrame) -> None:
    """Fit and evaluate logistic-regression models for ``SELLER_HOUSE``.

    Steps, in order:

    1. Drop the raw columns ``DAYS_ON_MARKET``, ``ADDRESS``, ``LOCATION``,
       ``STATUS``, ``PROPERTY_TYPE`` and ``ZIP_CODE``.
    2. Take a 70 % training split and over-sample it with SMOTE.
    3. Run recursive feature elimination (20 features) on the over-sampled
       data.
    4. Fit a statsmodels ``Logit`` on a fixed, hand-picked feature list and
       print its summary.
    5. Split the over-sampled data again, fit a scikit-learn
       ``LogisticRegression`` and print its accuracy and confusion matrix.
    6. Plot the ROC curve of that classifier.

    Args:
        houses: Listing data containing ``SELLER_HOUSE``, the columns listed
            in step 1 and the dummy columns named in ``final_x``.

    Returns:
        None. Results are printed and shown in a matplotlib window.
    """
    # Earlier experiment, kept as an inert string literal (never executed).
    """
    #Me just trying to fit the data without any outside influences
    f= f'SELLER_HOUSE ~ SQFT_PER + PRICE + C(LOCATION)'
    result= smf.logit(formula= str(f), data= houses).fit()
    print(result.summary2())

    y= ['SELLER_HOUSE']
    x= ['SQFT_PER', 'PRICE', 'LOC_699 - Not Defined', 'LOC_AA - Airport Area', 'LOC_CG - Columbus Grove', 'LOC_CV - Cypress Village', 'LOC_EASTW - Eastwood', 'LOC_EC - El Camino Real', 'LOC_GP - Great Park', 'LOC_IRSP - Irvine Spectrum', 'LOC_LGA - Laguna Altura', 'LOC_NK - Northpark', 'LOC_NW - Northwood', 'LOC_OC - Oak Creek', 'LOC_OH - Orchard Hills', 'LOC_OT - Orangetree', 'LOC_PS - Portola Springs', 'LOC_QH - Quail Hill', 'LOC_SH - Shady Canyon', 'LOC_SJ - Rancho San Joaquin', 'LOC_STG - Stonegate', 'LOC_Stonegate', 'LOC_TR - Turtle Rock', 'LOC_TRG - Turtle Ridge', 'LOC_UP - University Park', 'LOC_UT - University Town Center', 'LOC_WB - Woodbridge', 'LOC_WD - Woodbury', 'LOC_WI - West Irvine', 'LOC_WN - Walnut (Irvine)', 'LOC_WP - Westpark']
    x_train, x_test, y_train, y_test= train_test_split(houses[x], houses[y], test_size= 0.3, random_state= 500)
    logreg = LogisticRegression()
    logreg.fit(x_train, y_train.values.ravel())
    y_pred= logreg.predict(x_test)
    print('Accuracy of logistic regression classifier on test set:', round(logreg.score(x_test, y_test), 3))
    # This model is really bad
    """
    houses = houses.drop([
        'DAYS_ON_MARKET', 'ADDRESS', 'LOCATION', 'STATUS', 'PROPERTY_TYPE',
        'ZIP_CODE'
    ],
                         axis=1)
    columns = houses.columns.values.tolist()
    y = ['SELLER_HOUSE']
    x = [i for i in columns if i not in y]

    # Over-sampling using SMOTE (on the training split only).
    x_train, _, y_train, _ = train_test_split(houses[x],
                                              houses[y],
                                              test_size=0.3,
                                              random_state=500)
    x_columns = x_train.columns
    # Named `smote` rather than `os` so the stdlib module name is not
    # shadowed.  `fit_resample` replaces `fit_sample`, which was removed
    # from imbalanced-learn.
    smote = SMOTE(random_state=0)
    os_x, os_y = smote.fit_resample(x_train, y_train)
    os_x = pd.DataFrame(data=os_x, columns=x_columns)
    os_y = pd.DataFrame(data=os_y, columns=y)

    # Recursive Feature Elimination.  `n_features_to_select` must be passed
    # by keyword on scikit-learn >= 1.0.
    logreg = LogisticRegression(max_iter=600)
    rfe = RFE(logreg, n_features_to_select=20)
    rfe = rfe.fit(os_x, os_y.values.ravel())
    lst = [name for name, keep in zip(x, rfe.support_) if keep]
    X = os_x[lst]  # only referenced by the disabled model below
    Y = os_y['SELLER_HOUSE']
    #logit_model= sm.Logit(Y, X)
    #result= logit_model.fit()
    #print(result.summary2())

    # Model chosen from the RFE output:
    # these features have a p-value less than 0.05
    final_x = [
        'BATHS', 'ZIP_92602.0', 'ZIP_92618.0', 'LOC_699 - Not Defined',
        'LOC_TR - Turtle Rock', 'LOC_WD - Woodbury'
    ]
    #final_x= ['ZIP_92602.0', 'LOC_699 - Not Defined', 'LOC_TR - Turtle Rock', 'LOC_WD - Woodbury']
    X2 = os_x[final_x]
    logit_model2 = sm.Logit(Y, X2)
    result2 = logit_model2.fit()
    print(result2.summary2())

    # Final Model
    x_train2, x_test2, y_train2, y_test2 = train_test_split(X2,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=500)
    logreg = LogisticRegression()
    logreg.fit(x_train2, y_train2)
    y_pred = logreg.predict(x_test2)
    print('Accuracy of logistic regression classifier on test set:',
          round(logreg.score(x_test2, y_test2), 2))

    conf_matrix = confusion_matrix(y_test2, y_pred)
    print(conf_matrix)
    # So 22+61 correct predictions and 13+44 wrong predictions

    # Use the predicted probabilities for both the AUC and the curve so the
    # area in the legend is the area under the curve that is drawn
    # (previously the AUC was computed from hard 0/1 predictions).
    y_score = logreg.predict_proba(x_test2)[:, 1]
    logit_roc_auc = roc_auc_score(y_test2, y_score)
    fpr, tpr, _ = roc_curve(y_test2, y_score)
    plt.figure()
    plt.plot(fpr,
             tpr,
             label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
z_score, p_val_z = sm.stats.proportions_ztest(count, N, alternative='larger') if p_val_z > 0.05: print("Test fails based on hypothesis test") else: print("Test succeeds based on hypothesis test") ## Logistic regression model can also be applied to this questions## # generate new variables df2['intercept'] = 1 df2[['ab_page', 'ab_drop']] = pd.get_dummies(df2['landing_page']) df2.drop('ab_drop', axis=1, inplace=True) # train model logistics_model = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']]) train = logistics_model.fit() print(train.summary()) # the P is larger than 0.05, so it proves the same result with hypothesis test ## Will it be related to the countries? ############################# df_country = pd.read_csv('countries.csv') df3 = df2.merge(df_country, how='outer', on='user_id') # merge data # generate variables df3[['UK', 'US']] = pd.get_dummies(df3.country) # train the model logistics_model_with_country = sm.Logit(
categorical_dummies = pd.get_dummies(ml[categorical_features]) continuous_features = ['AGE', 'NUMOCCS', 'MOD_YEAR', 'TRAV_SP', 'DR_HGT', 'DR_WGT', 'PREV_SUS', 'PREV_DWI', 'PREV_SPD'] # drop variables to avoid perfect separation: when one or more explanatory variables perfectly explains variation in the dependent variable unrelated = ['SEX_Other', 'INJ_SEV_No Apparent Injury', 'INJ_SEV_Other', 'INJ_SEV_Suspected Minor Injury', 'INJ_SEV_Suspected Serious Injury', 'DEFORMED_Other', 'LGT_COND_Other', 'RELJCT2_Other'] reference = ['SEX_Female', 'REST_USE_None Used', 'VSURCOND_Dry', 'DEFORMED_Minor Damage', 'WEATHER_Clear', 'LGT_COND_Daylight', 'RELJCT2_Non-Junction', 'DR_DRINK_No Drinking', 'BODY_SIZE_Small'] ml_recoded = pd.concat([ml[continuous_features], categorical_dummies], axis=1) ml_recoded = ml_recoded.drop(unrelated, axis=1) ml_recoded = ml_recoded.drop(reference, axis=1) ml_recoded = ml_recoded.fillna(ml_recoded.mean()) X = ml_recoded.drop('INJ_SEV_Fatal Injury', axis=1) X = sm.tools.tools.add_constant(X) y = ml_recoded['INJ_SEV_Fatal Injury'] logit = sm.Logit(y, X) result = logit.fit() print result.summary() data = {k: np.exp(v) for k, v in result.params.iteritems()} odds = pd.DataFrame(data.items(), columns=['Variable', 'Odds']).sort('Variable') print odds ######################## ## CLASSIFIER ## ######################## X, y = X.values, y.values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=35) LR = LogisticRegression() LR = LR.fit(X_train, y_train)
import pandas as pd
import statsmodels.api as sm
import numpy as np
import sys

# Usage: python <script> <csv_file>
# Fits a logit model on a comma-separated numeric matrix and writes the
# fitted parameters to a text file.
filename = sys.argv[1]
data = np.loadtxt(filename, delimiter=',')
[nRow, nCol] = data.shape

# Features are columns 0 .. nCol-3 and the label is the last column.
# NOTE(review): this leaves column nCol-2 unused - confirm that skipping it
# is intended and not an off-by-one in the slice bound.
data_train = data[:, 0:nCol - 2]
labels_train = data[:, nCol - 1]

logit = sm.Logit(labels_train, data_train)
result = logit.fit()

# Write the parameters with an explicit, properly closed file handle.
# The previous code rebound sys.stdout to a file that was never closed and
# used the Python-2-only print statement.
with open("/Users/b.behmardi/weight.txt", "w") as out:
    out.write(str(result.params) + "\n")
import math

loansData = pd.read_csv(
    'https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv'
)

# Clean the raw text columns into numbers.
# '12.5%' -> 0.125
loansData['Interest.Rate'] = loansData['Interest.Rate'].map(
    lambda x: round(float(x.rstrip('%')) / 100, 4))
# '36 months' -> 36
loansData['Loan.Length'] = loansData['Loan.Length'].map(
    lambda x: int(x.rstrip(' months')))
# First three characters of the range, e.g. '735-739' -> 735
loansData['FICO.Score'] = loansData['FICO.Range'].map(lambda x: int(x[:3]))

# Binary target: 1 when the interest rate is above 12 %, else 0.
loansData['IR_TF'] = loansData['Interest.Rate'].map(lambda x: 1
                                                    if x > .12 else 0)

# statsmodels does not add a constant term by itself.
intercept = [1] * len(loansData)
loansData['Intercept'] = intercept

ind_vars = ['Intercept', 'Amount.Requested', 'FICO.Score']
df = loansData

logit = sm.Logit(df['IR_TF'], df[ind_vars])
result = logit.fit()
coeff = result.params
print(coeff)


def logistic_function(FicoScore, LoanAmount):
    """Evaluate the fitted model for one applicant.

    Computes ``1 / (1 + exp(b0 + b_fico * FicoScore + b_amt * LoanAmount))``
    from the module-level ``coeff``. With this positive exponent the value
    is ``1 - P(IR_TF = 1)`` under the fitted logit model, i.e. the modelled
    probability of an interest rate at or below 12 %.

    Args:
        FicoScore: Applicant FICO score.
        LoanAmount: Requested loan amount.

    Returns:
        Tuple ``(prob, p)`` where ``prob`` is the probability described
        above and ``p`` is 1 if ``prob > 0.7`` and 0 otherwise.
    """
    # Coefficients are looked up by column name: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    linear_term = (coeff['Intercept'] + coeff['FICO.Score'] * FicoScore +
                   coeff['Amount.Requested'] * LoanAmount)
    prob = 1 / (1 + math.exp(linear_term))
    p = 1 if prob > 0.7 else 0
    return prob, p
#Part 2 - Statistical Regression Analysis df['Flag'] = df['Days Delinquent'] > 90 df['log_annual_income'] = np.log(df['Annual Income']) df['log_loan_amount'] = np.log(df['Loan Amount']) df['Loan Type'].unique() df = pd.get_dummies(df, prefix='LoanType', columns=['Loan Type']) # Independent variables 'X' X = df[[ 'Age', 'log_annual_income', 'log_loan_amount', 'LoanType_Auto', 'LoanType_Business', 'LoanType_Home' ]] # Dependent variable 'y' y = df['Flag'] results = sm.Logit(y, X).fit() # Estimated model summary results.summary() results.summary2() # Based on the logit results, we see that we do not have a sufficient # model to predict whether or not a customer will be delinquent for over # 90 days. When looking at the p-values shown in our results.summary2() output, # we see that none of our independent variables are significant at the 0.05 # alpha level to predict if a customer will be flagged or not. # Part 3 - Predictive Analytics with Machine Learning from sklearn.cluster import KMeans kmeans_model = KMeans(n_clusters=2, random_state=1) cluster_labels = kmeans_model.fit_predict( df[['log_annual_income', 'log_loan_amount']]) df['cluster'] = cluster_labels
linreg = sm.OLS(y, X).fit() linreg.summary() print(linreg.summary().tables[1]) X = sm.add_constant(X) linreg2 = sm.OLS(y, X).fit() linreg2.summary() # Let us perform logistic regression with statmodels logreg = sm.Logit(y, X[['const', 'balance']]).fit() logreg.summary() print(logreg.summary().tables[1]) logreg2 = sm.Logit(y, X).fit() logreg2.summary() # Let us perform the regressions with scikit-learn X = default[['balance', 'income', 'd_student']] sk_linreg = LinearRegression().fit(X, y)
# Para obtener los parametros mod.intercept_ mod.coef_ # Para ver las probabilidades proba = mod.predict_proba(X) proba[0:5, ] # las primeras cinco probabilidades # Para obtener las estimaciones y_hat = mod.predict(X) y_hat[0:5] # Utilizando statmodels --------------------------------------------- # Vamos a agregar la columna de 1 al inicio para intercepto X_train = sm.add_constant(X) mod2 = sm.Logit(y, X_train) result = mod2.fit() # printing the summary table result.summary() # Para ver los coeficientes coefficients = result.params coefficients
y_pred_proba = logistic.predict_proba(X_LR)[::, 1] fpr, tpr, _ = metrics.roc_curve(y, y_pred_proba) auc = metrics.roc_auc_score(y, y_pred_proba) plt.plot(fpr, tpr, label="data 1, auc=" + str(auc)) plt.legend(loc=4) ## get summary information for logistic regression import statsmodels.api as sm X = pd.DataFrame(data=X_LR, index=None, columns=['A', 'B']) X['intercept'] = 1.0 Y = finalDf['class'].copy() logit1 = sm.Logit(Y, X) #methods: bfgs lbfgs cg ncg minimize logit1.fit(method='newton').summary() logit1.fit(method='newton').summary2() logit1.fit().params ### FIT THE MODEL WITH MACHINE LEARNING (MLP) #import scipy.io as sio import tensorflow.keras as kr nn = [2, 10, 3, 10, 1] model = kr.Sequential() #model.add(kr.layers.Dense(nn[1],kernel_initializer='normal', activation='relu',input_dim=nn[0])) #model.add(kr.layers.Dense(nn[-1],kernel_initializer='normal', activation='sigmoid')) model.add(
X_train = insample_smo.drop(['Default Status'],axis=1) y_train = insample_smo.loc[:, 'Default Status'] ''' # Multicolinearity check vif = pd.DataFrame() vif["features"] = X_train.columns vif["VIF Factor"] = [ variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1]) ] # Build models '''Logistic Regression''' # Logistic regression with stats logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit() print('ALL', logit_model.summary()) # Logistic regression with sklearn log = LogisticRegression(random_state=0, solver='lbfgs') modeloutcome(log, X_train, y_train, X_test, y_test) searchthreshold(log, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) learningcurve(log, X_train, y_train) log.coef_ # C_parameter to regularize C_param = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] for c in C_param: log = LogisticRegression(random_state=0, C=c) print(c) print(modeloutcome(log, X_train, y_train, X_test, y_test))
""" import numpy as np import statsmodels.api as sm # Load the data from Spector and Mazzeo (1980). Examples follow Greene's # Econometric Analysis Ch. 21 (5th Edition). spector_data = sm.datasets.spector.load() spector_data.exog = sm.add_constant(spector_data.exog, prepend=False) # Linear Probability Model using OLS lpm_mod = sm.OLS(spector_data.endog, spector_data.exog) lpm_res = lpm_mod.fit() # Logit Model logit_mod = sm.Logit(spector_data.endog, spector_data.exog) logit_res = logit_mod.fit() # Probit Model probit_mod = sm.Probit(spector_data.endog, spector_data.exog) probit_res = probit_mod.fit() # This example is based on Greene Table 21.1 5th Edition # Linear Model Parameters print lpm_res.params # Logit Model Parameters print logit_res.params # Probit Model Parameters print probit_res.params #.. print "Typo in Greene for Weibull, replaced with logWeibull or Gumbel" #.. print "(Tentatively) Weibull Model"
import statsmodels.api as sm from sklearn.metrics import roc_curve, auc plt.rcParams["font.sans-serif"] = 'SimHei' plt.rcParams['axes.unicode_minus'] = False if __name__ == '__main__': matplotlib.rcParams['axes.unicode_minus'] = False data = pd.read_csv('WoeData.csv') Y = data['SeriousDlqin2yrs'] X = data.drop([ 'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'NumberOfDependents' ], axis=1) X1 = sm.add_constant(X) logit = sm.Logit(Y, X1) result = logit.fit() print(result.params) test = pd.read_csv('TestWoeData.csv') Y_test = test['SeriousDlqin2yrs'] X_test = test.drop([ 'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'NumberOfDependents' ], axis=1) X3 = sm.add_constant(X_test) resu = result.predict(X3) fpr, tpr, threshold = roc_curve(Y_test, resu) rocauc = auc(fpr, tpr) plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
print(churn.pivot_table(['total_charges'], index=['churn', 'custserv_calls'])) print( churn.pivot_table(['total_charges'], index=['churn'], columns=['custserv_calls'])) print(churn.pivot_table(['total_charges'], index=['custserv_calls'], columns=['churn'], \ aggfunc='mean', fill_value='NaN', margins=True)) # Fit a logistic regression model dependent_variable = churn['churn01'] independent_variables = churn[[ 'account_length', 'custserv_calls', 'total_charges' ]] independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True) logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit() #logit_model = smf.glm(output_variable, input_variables, family=sm.families.Binomial()).fit() # print(logit_model.summary()) print("\nQuantities you can extract from the result:\n%s" % dir(logit_model)) print("\nCoefficients:\n%s" % logit_model.params) print("\nCoefficient Std Errors:\n%s" % logit_model.bse) #logit_marginal_effects = logit_model.get_margeff(method='dydx', at='overall') #print(logit_marginal_effects.summary()) print( "\ninvlogit(-7.2205 + 0.0012*mean(account_length) + 0.4443*mean(custserv_calls) + 0.0729*mean(total_charges))" ) def inverse_logit(model_formula): from math import exp
# b. 目标是使用 **statsmodels** 来拟合你在 **a.** 中指定的回归模型,以查看用户收到的不同页面是否存在显著的转化差异。但是,首先,你需要为这个截距创建一个列( 原文:column) ,并为每个用户收到的页面创建一个虚拟变量列。添加一个 **截距** 列,一个 **ab_page** 列,当用户接收 **treatment** 时为1, **control** 时为0。 # In[32]: df2[['control', 'ab_page']] = pd.get_dummies(df['group']) df2.drop('control', axis=1, inplace=True) df2.head() # # c. 使用 **statsmodels** 导入你的回归模型。 实例化该模型,并使用你在 **b.** 中创建的2个列来拟合该模型,用来预测一个用户是否会发生转化。 # In[33]: df2['intercept'] = 1 lm = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']]) results = lm.fit() results.summary() # d. 请在下方提供你的模型摘要,并根据需要使用它来回答下面的问题。 # In[34]: results.params # e. 与 **ab_page** 关联的 p-值是多少? 为什么它与你在 **II** 中发现的结果不同?<br><br> **提示**: 与你的回归模型相关的零假设与备择假设分别是什么?它们如何与 **Part II** 中的零假设和备择假设做比较? # # **该模型中 与 ab_page 关联的p值为0.190,逻辑回归中假设为: # H0:Pnew = Pold # H1:Pnew≠Pold # 逻辑回归的零假设和备择假设与1部分的假设不同,零假设为旧页面和新页面的转换率相同,即自变量ab_page与反应变量converged可能无影响作用,备择假设为两者转换率不同,与零假设相反。之前计算的p值是用于单边检验的,而这个是双边检验,因此p值是有区别的,此处的p-值是指ab_page对应转换率的影响程度,p-值越小越具有显著性差异。
def logit_object():
    """Return an unfitted Logit model built on the Spector dataset.

    The dataset is loaded in its pandas form and a constant column is added
    to the explanatory variables before the model is constructed.
    """
    dataset = sm.datasets.spector.load_pandas()
    dataset.exog = sm.add_constant(dataset.exog)
    return sm.Logit(dataset.endog, dataset.exog)
rfe.ranking_ #identified columns Recursive Feature Elimination idc_rfe = pd.DataFrame({ "rfe_support": rfe.support_, "columns": [i for i in X_train.columns], "ranking": rfe.ranking_, }) cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist() cols.extend(features.FlightPair.values.tolist()) cols = list(set(cols)) logit = sm.Logit(y[type], X[type].loc[:, cols]) flogit = logit.fit() print(flogit.summary()) coefficients = flogit.summary2().tables[1] coefficients = coefficients[coefficients['P>|z|'] < 0.1] coefficients['Odds Ratio'] = np.exp(coefficients['Coef.']) coefficients['O.R.LB'] = np.exp(coefficients['[0.025']) coefficients['O.R.UB'] = np.exp(coefficients['0.975]']) coefficients['Probability'] = coefficients['Odds Ratio'].round( 1) * 0.5 #- 0.5 coefficients['Probability'] = coefficients['Probability'].mask( coefficients['Probability'] >= 1, 0.99) coefficients.join(flight_durations) coefficients.to_csv('StatSigFlights' + month + '.csv', mode='a')
# n. What do the z-score and p-value you computed in the previous question mean for #the conversion rates of the old and new pages? Do they agree with the findings in parts **j.** and **k.**? # >Since the z-score of 0.0949 does not exceed the critical value of 1.959963984540054, #we keep the null hypothesis that the difference between the two proportions is no different #from zero. Since they are not different, we may decide to keep the experiment to run longer # <a id='regression'></a> # ### Part III - A regression approach # # `1.` In this final part, you will see that the result you acheived in the previous A/B test can also be acheived by #performing regression.<br><br> # # a. Since each row is either a conversion or no conversion, what type of regression should you be performing in this case? # >Logistic Regression # b. The goal is to use **statsmodels** to fit the regression model you specified in part **a.** to see if there is a significant difference in conversion based on which page a customer receives. However, you first need to create a colun for the intercept, and create a dummy variable column for which page each user received. Add an **intercept** column, as well as an **ab_page** column, which is 1 when an individual receives the **treatment** and 0 if **control**. # In[40]: df2['intercept'] = 1 df2['ab_page'] = np.where(df2['group'] =='control',0,1) # c. Use **statsmodels** to import your regression model. Instantiate the model, and fit the model using the two columns you created in part **b.** to predict whether or not an individual converts. # In[41]: lm = sm.Logit(df2['converted'],df2[['intercept','ab_page']]) r = lm.fit()
import seaborn as sb enc = OneHotEncoder(handle_unknown='ignore') colums = ["Productivity", "day", "day_of_the_week", "time_of_the_day"]#'timestamp', "day_of_the_week", "time_of_the_day", #"timestamp_local"] with open("x_input", 'rb') as f: x_input = pickle.load(f) all = x_input.copy() y_output = [[i.pop(0)] for i in x_input] #print x_input = np.array(x_input) y_output = np.array(y_output) label = LabelEncoder() x_input[:, 2] = label.fit_transform(x_input[:, 2]) x_input[:, 3] = label.fit_transform(x_input[:, 3]) ohe = OneHotEncoder(categorical_features = [2, 3], sparse=False) x_input = ohe.fit_transform(x_input) x_input = pd.DataFrame(data = x_input, index = range(len(x_input))) y_output = pd.DataFrame(data = y_output, index = range(len(y_output))) sb.heatmap(x_input.corr()) logreg = LogisticRegression() rfe = RFE(logreg, 5) rfe = rfe.fit(x_input, y_output ) print(rfe.support_) print(rfe.ranking_) logit_model=sm.Logit(y_output, x_input) result=logit_model.fit() print(result.summary())
# b. The goal is to use **statsmodels** to fit the regression model you specified in part **a.** to see if there is a significant difference in conversion based on which page a customer receives. However, you first need to create in df2 a column for the intercept, and create a dummy variable column for which page each user received. Add an **intercept** column, as well as an **ab_page** column, which is 1 when an individual receives the **treatment** and 0 if **control**. # In[36]: #Creating dummy variable columns for landing page df2['intercept'] = 1 df2[['ab_page', 'old_page']] = pd.get_dummies(df2['landing_page']) df2.head() # c. Use **statsmodels** to instantiate your regression model on the two columns you created in part b., then fit the model using the two columns you created in part **b.** to predict whether or not an individual converts. # In[37]: #Using Logit function for Logistic regression log_mod = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']]) result = log_mod.fit() # d. Provide the summary of your model below, and use it as necessary to answer the following questions. # In[38]: # result.summary2() is used inplace of result.summary() to avoid AttributeError: module 'scipy.stats' has no attribute 'chisqprob' result.summary2() # In[39]: #Converting to proportional value 1 / np.exp(result.params) # e. What is the p-value associated with **ab_page**? Why does it differ from the value you found in **Part II**?<br><br> **Hint**: What are the null and alternative hypotheses associated with your regression model, and how do they compare to the null and alternative hypotheses in **Part II**?
c='g', label='Unfavourable') ax.legend(loc='upper right') ax.set_xlabel("Plt peak") ax.set_ylabel("Plt trough") fig.savefig(os.path.join(outdir, "plt_peak_vs_trough.png"), dpi=200) # build a simple logistic regression model # add a constant and fit result, ci = logistic_regression(X, outcomes == 1) print result.summary() # plot the decision boundary for model with Plt peak and Plt trough (only) plt_dat = X.loc[:, ['Plt peak', 'Plt trough']] logit_model_plt = sm.Logit(outcomes == 1, sm.add_constant(plt_dat)) result_plt = logit_model_plt.fit() coeff = result_plt.params intercept = -coeff['const'] / coeff['Plt peak'] slope = -coeff['Plt trough'] / coeff['Plt peak'] fit_x = np.linspace(peaks_dat.loc[:, 'Plt trough'].min(), peaks_dat.loc[:, 'Plt trough'].max(), 20) fit_y = intercept + slope * fit_x fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(peaks_dat.loc[outcomes == 2, 'Plt trough'], peaks_dat.loc[outcomes == 2, 'Plt peak'], c='b', label='Favourable') ax.scatter(peaks_dat.loc[outcomes == 1, 'Plt trough'],
2 * np.dot(x.T, x)) print(numdiff.approx_hess(xk, fun2, 1e-3, (y, x))[0] - 2 * np.dot(x.T, x)) gt = (-x * 2 * (y - np.dot(x, [1, 2, 3]))[:, None]) g = approx_fprime_cs((1, 2, 3), fun1, (y, x), h=1.0e-20) #.T #this shouldn't be transposed gd = numdiff.approx_fprime((1, 2, 3), fun1, epsilon, (y, x)) print(maxabs(g, gt)) print(maxabs(gd, gt)) import statsmodels.api as sm data = sm.datasets.spector.load(as_pandas=False) data.exog = sm.add_constant(data.exog, prepend=False) #mod = sm.Probit(data.endog, data.exog) mod = sm.Logit(data.endog, data.exog) #res = mod.fit(method="newton") test_params = [1, 0.25, 1.4, -7] loglike = mod.loglike score = mod.score hess = mod.hessian #cs doesn't work for Probit because special.ndtr doesn't support complex #maybe calculating ndtr for real and imag parts separately, if we need it #and if it still works in this case print('sm', score(test_params)) print('fd', numdiff.approx_fprime(test_params, loglike, epsilon)) print('cs', numdiff.approx_fprime_cs(test_params, loglike)) print('sm', hess(test_params)) print('fd', numdiff.approx_fprime(test_params, score, epsilon)) print('cs', numdiff.approx_fprime_cs(test_params, score))
def annotation_lv_logistic_regression(anno, S_U_binary, af_bins, *,
                                      multiplier=1, permute=False):
    """Logistic regression of a binary SNP indicator on one annotation.

    Rows where ``anno`` is NaN are dropped and the annotation is
    standardised. Every row with ``S_U_binary == 1`` is a case; for each
    ``af_bins`` value seen among the cases, ``multiplier`` times as many
    controls are drawn at random, without replacement, from the remaining
    rows of the same bin. A logit model with an intercept is then fitted to
    the cases and sampled controls.

    Args:
        anno: 1-D NumPy array of annotation values (NaN = unobserved).
        S_U_binary: 1-D NumPy array, 1 for case rows.
        af_bins: 1-D NumPy array of bin labels, aligned with ``anno``.
        multiplier: Controls drawn per case in each bin. Must be an integer
            (it is used as a sample size). Defaults to 1.
        permute: If true, replace the observed indicator by Bernoulli draws
            with the same mean before sampling. Defaults to False.

    Returns:
        Dict with keys ``'beta'``, ``'beta_ub'``, ``'beta_lb'`` and
        ``'pvalue'`` for the annotation coefficient. All four are NaN when
        there is no observed annotation, when the annotation has zero
        variance, when fitting fails, or when the optimiser does not
        converge.

    Raises:
        ValueError: If a bin has fewer control rows than must be sampled.
    """
    def _nan_result():
        # Result returned whenever the test cannot be carried out.
        return {'beta': np.nan, 'beta_ub': np.nan,
                'beta_lb': np.nan, 'pvalue': np.nan}

    # Keep only rows where the annotation is observed.
    observed_indices = ~np.isnan(anno)
    observed_anno = anno[observed_indices]
    observed_S_U_binary = S_U_binary[observed_indices]
    observed_af_bins = af_bins[observed_indices]

    # Nothing observed: there is nothing to fit.
    if len(observed_anno) == 0:
        return _nan_result()

    if permute:
        num_ones = np.sum(observed_S_U_binary)
        pp = num_ones / len(observed_S_U_binary)
        observed_S_U_binary = np.random.binomial(
            1, pp, size=len(observed_S_U_binary))

    # A constant annotation cannot be standardised or tested.
    if np.var(observed_anno) == 0:
        return _nan_result()
    observed_anno = ((observed_anno - np.mean(observed_anno))
                     / np.std(observed_anno))

    # Cases: every row flagged 1.
    loaded_snps = np.where(observed_S_U_binary == 1.0)[0]
    ys = [[1] * len(loaded_snps)]
    gs = [observed_anno[loaded_snps]]

    # Number of cases per bin (insertion order = first occurrence, which
    # fixes the order of the random draws below).
    bin_counts = {}
    for snp_index in loaded_snps:
        snp_af_bin = observed_af_bins[snp_index]
        bin_counts[snp_af_bin] = bin_counts.get(snp_af_bin, 0) + 1

    # Controls: sample matched numbers of non-case rows from each bin.
    for bin_num, case_count in bin_counts.items():
        num_in_bin = case_count * multiplier
        null_indices = np.where((observed_S_U_binary != 1.0)
                                & (observed_af_bins == bin_num))[0]
        if len(null_indices) < num_in_bin:
            # Previously a debugger breakpoint; sampling without replacement
            # would raise ValueError right after it anyway, so fail here
            # with a message that names the offending bin.
            raise ValueError(
                'bin {!r} has {} control rows but {} must be sampled'.format(
                    bin_num, len(null_indices), num_in_bin))
        randomly_sampled_null_indices = np.random.choice(
            null_indices, size=num_in_bin, replace=False)
        ys.append([0] * len(randomly_sampled_null_indices))
        gs.append(observed_anno[randomly_sampled_null_indices])

    ys = np.hstack(ys)
    gs = np.hstack(gs)

    try:
        model = sm.Logit(ys, sm.add_constant(gs))
        res = model.fit()
        # Row/element 1 is the annotation; 0 is the added constant.
        ci = res.conf_int()[1, :]
        beta_lb = ci[0]
        beta_ub = ci[1]
        beta = res.params[1]
        pvalue = res.pvalues[1]
        if not res.mle_retvals['converged']:
            return _nan_result()
    except Exception:
        # Best effort: any fitting failure is reported as NaN. Unlike a
        # bare `except`, KeyboardInterrupt / SystemExit still propagate.
        return _nan_result()

    return {'beta': beta, 'beta_ub': beta_ub,
            'beta_lb': beta_lb, 'pvalue': pvalue}
x_pred = clr.predict(X) R2 = 1 - ((x_pred - x0)**2).sum() / ((x0 - x0.mean())**2).sum() vif = 1 / (1 - R2) if vif > 10: print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif)) ######################### # Step 5: 应用逻辑回归模型# ######################### multi_analysis = [i + '_WOE' for i in var_IV_sortet_2] y = trainData['target'] X = trainData[multi_analysis].copy() X['intercept'] = [1] * X.shape[0] LR = sm.Logit(y, X).fit() summary = LR.summary2() pvals = LR.pvalues.to_dict() params = LR.params.to_dict() #发现有变量不显著,因此需要单独检验显著性 varLargeP = {k: v for k, v in pvals.items() if v >= 0.1} varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True) varLargeP = [i[0] for i in varLargeP] p_value_list = {} for var in varLargeP: X_temp = trainData[var].copy().to_frame() X_temp['intercept'] = [1] * X_temp.shape[0] LR = sm.Logit(y, X_temp).fit() p_value_list[var] = LR.pvalues[var] for k, v in p_value_list.items():
def perform_logit(df, dv, ivs):
    """Fit a statsmodels Logit of ``df[dv]`` on ``df[ivs]``.

    Args:
        df: Data frame holding the model columns.
        dv: Key selecting the dependent variable in ``df``.
        ivs: Key(s) selecting the independent variables in ``df``.

    Returns:
        The fitted results object returned by ``Logit.fit()``.
    """
    endog = df[dv]
    exog = df[ivs]
    return sm.Logit(endog, exog).fit()