def basic_significance(dataframe, list_to_dummify, target):
    '''
    Fits a non-regularized logistic model to `target` using `dataframe`
    predictors. Prints model train accuracy and returns the significant
    coefficients ordered by absolute magnitude.
    ----------
    dataframe: source data; a copy is processed, the original is untouched
    list_to_dummify: a list of columns in string format that require
        dummification before modeling
    target: name of the target column
    Returns: (fitted statsmodels Logit results, DataFrame of significant
        coefficients with their p-values)
    '''
    # process the dataframe
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)

    # fit the model — instance calls instead of the original unbound
    # `Logit.fit(logit)` / `Logit.predict(logit, params)` idiom
    logit = Logit(y, X)
    fitted_logit = logit.fit()

    # store train accuracy: round predicted probabilities to 0/1 labels
    c_mat = confusion_matrix(y, np.round(fitted_logit.predict()))
    accuracy = sum(c_mat.diagonal()) / np.sum(c_mat)
    print('model train accuracy: %s' % (accuracy))

    # store significant coefs (p < 0.05), sorted by |coef| descending
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs
def score(df):
    """Print VIFs, a statsmodels Logit summary, and 5-fold cross-validated
    accuracy/precision/recall for a logistic regression on `df`.

    Relies on the module-level `get_X_y` helper to split `df` into X and y.
    Returns None; all diagnostics go to stdout.
    """
    X, y = get_X_y(df)

    # variance inflation factors, one per predictor column
    vif = variance_inflation_factor
    print('VIF: ')
    for i in range(X.shape[1]):
        print(vif(X, i))

    X = add_constant(X)
    model = Logit(y, X).fit()
    # BUG FIX: the original called model.summary(xname=names), but `names`
    # is never defined in this function (NameError unless a global leaked in).
    print(model.summary())

    # ensure positional indexing with KFold's integer index arrays works
    # even if get_X_y returned pandas objects
    X = np.asarray(X)
    y = np.asarray(y)

    kfold = KFold(n_splits=5)
    accuracies = []
    precisions = []
    recalls = []
    for train_index, test_index in kfold.split(X):
        model = LogisticRegression(solver="lbfgs")
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))
    print("Accuracy:", np.average(accuracies))
    print("Precision:", np.average(precisions))
    print("Recall:", np.average(recalls))
def fit_logit(self):
    '''
    Runs a logistic regression of X (all columns except 'repeat' and
    'CustomerNo') against y ('repeat') for this instance's DataFrame.
    Prints the statsmodels fit summary, baseline (mode-prediction) model
    diagnostics, and cross-validated logit diagnostics.
    Returns the SMOTE-resampled X and y.
    '''
    self.y = self.df['repeat'].values
    self.X = self.df.drop(['repeat', 'CustomerNo'], axis=1).values

    # balance the classes before fitting
    self.X_smote, self.y_smote = smote(self.X, self.y, 0.5)
    self.X_const = add_constant(self.X_smote, prepend=True)

    logit_model = Logit(self.y_smote, self.X_const).fit()
    print(logit_model.summary())
    y_predict = logit_model.predict(self.X_const)

    # baseline: assign the mode class to every individual
    baseline = self.mode_cross_val(self.X_smote, self.y_smote)
    print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
        baseline[0], baseline[1], baseline[2]))

    # cross-validated logistic regression diagnostics
    fitted = self.logit_cross_val(self.X_smote, self.y_smote)
    print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
        fitted[0], fitted[1], fitted[2]))

    return self.X_smote, self.y_smote
def logit_reg():
    """Fit a statsmodels Logit on SMOTE-resampled training data and print
    test-set diagnostics (confusion matrix, accuracy, precision, recall).

    Relies on module-level `df` and the `prep_X_y` helper.
    Returns the fitted statsmodels Logit results object.
    """
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    # 'powell' is a derivative-free optimizer — presumably chosen for
    # convergence robustness here; TODO confirm with the original author
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    # round probabilities to 0/1 class labels
    y_pred = lm.predict(X_test).round(0)
    # BUG FIX: converted Python-2 print statements (a SyntaxError under
    # Python 3) into print() calls; output is unchanged.
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm
def logit_reg():
    """Fit a statsmodels Logit on SMOTE-resampled training data and print
    test-set diagnostics (confusion matrix, accuracy, precision, recall).

    Relies on module-level `df` and the `prep_X_y` helper.
    Returns the fitted statsmodels Logit results object.
    """
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    # 'powell' is a derivative-free optimizer — presumably chosen for
    # convergence robustness here; TODO confirm with the original author
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    # round probabilities to 0/1 class labels
    y_pred = lm.predict(X_test).round(0)
    # BUG FIX: converted Python-2 print statements (a SyntaxError under
    # Python 3) into print() calls; output is unchanged.
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm
class LogReg:
    """Minimal wrapper giving statsmodels' Logit an sklearn-like fit/predict API."""

    def __init__(self):
        # feature coefficients (intercept excluded); None until fit() runs
        self.coef_ = None

    def fit(self, X, y):
        """Fit an unregularized logistic regression of y on X plus an intercept."""
        X = add_constant(X)
        self.lr = Logit(y, X)
        self.l_fitted = self.lr.fit()
        # BUG FIX: add_constant() PREPENDS the intercept column by default,
        # so the intercept is params[0]. The original `params[:-1]` kept the
        # intercept and dropped the LAST feature's coefficient instead.
        self.coef_ = self.l_fitted.params[1:]

    def predict(self, X):
        """Return predicted probabilities for X; fit() must be called first."""
        if self.coef_ is None:
            print('you must first fit the model')
            return
        X = add_constant(X)
        return self.lr.predict(self.l_fitted.params, X)
# plot auc = ax.plot(fpr, tpr, 'b', label='Val AUC = %0.3f' % roc_auc) plt.legend(loc='lower right') ax.plot([0, 1], [0, 1], 'r--') ax.set_xlim([0, 1]) ax.set_ylim([0, 1]) ax.set_ylabel('True Positive Rate') ax.set_xlabel('False Positive Rate') return auc #%% if __name__ == '__main__': from statsmodels.discrete.discrete_model import Logit import numpy as np import pandas as pd from sklearn.metrics import roc_auc_score train = pd.read_csv(r'./train.csv') logit_model = Logit(train['target'], train.iloc[:, 2:-1]) logit_model = logit_model.fit( disp=0) # disp=0 Don't show convergence message. predsLog = logit_model.predict(train.iloc[:, 2:-1]) #%% fig_1 = plt.figure(figsize=(6, 6)) aucplot(train['target'], predsLog) plt.show()
# Model Evaluation ------------------------------------------------------------- for ds_name, ds in zip(dataset_names, datasets): X_train, y_train = ds # Decision Tree dt = DecisionTreeClassifier() dt.fit(X_train, y_train) y_pred = dt.predict(X_test) score = geometric_mean_score(y_test, y_pred) score = round(score, 3) print("{} Dataset with Decision Tree: {}".format(ds_name, score)) # Logistic Regression logit = Logit(endog=y_train, exog=X_train) result = logit.fit(maxiter=1000, disp=0) y_pred = logit.predict(params=result.params, exog=X_test).round() score = geometric_mean_score(y_test, y_pred) score = round(score, 3) print("{} Dataset with Logistic Regression: {}".format(ds_name, score)) # Random Forest rf = RandomForestClassifier(n_estimators=200) rf.fit(X_train, y_train) y_pred = rf.predict(X_test) score = geometric_mean_score(y_test, y_pred) score = round(score, 3) print("{} Dataset with Random Forest: {}".format(ds_name, score)) # Check effect of each variable ================================================ # Logistic Regression (Coefficients) logit = Logit(endog=y_rus, exog=X_rus)
# Logistic regression using 'statsmodels.discrete.discrete_model.Logit' # 'Logit' expects data in a different format; values of 'y' must be integers # Make dataset y_train_sm = [0 if label == 'B' else 1 for label in y_train] y_train_sm = pd.Series(y_train_sm) y_test_sm = [0 if label == 'B' else 1 for label in y_test] y_test_sm = pd.Series(y_test_sm) # Instantiate a Logistic Regression classifier (using statsmodels) logit = Logit(endog=y_train_sm, exog=X_train) result = logit.fit() print('- Logistic regression result using statsmodels:\n', result.summary()) # Predict probabilities of train & test sets y_train_sm_prob = logit.predict(params=result.params, exog=X_train) y_test_sm_prob = logit.predict(params=result.params, exog=X_test) # Plot predicted probabilities against train set plt.figure(1) plt.scatter( x=range(X_train.shape[0]), y=y_train_sm_prob, color=['red' if prob >= 0.5 else 'blue' for prob in y_train_sm_prob], alpha=0.5) plt.title('Sigmoid values for train data') plt.xlabel('observation number') plt.ylabel('probability') plt.show() # Plot predicted probabilities against train set
#%%
# BUG FIX: `from datetime import datetime` originally sat at the very bottom
# of this chunk, after `datetime.now()` was already used — a NameError when
# the file runs top-to-bottom as a script (it only worked if the notebook
# cell was executed out of order). Moved to the top with the other imports.
from datetime import datetime

from statsmodels.discrete.discrete_model import Logit
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import os

# timestamp used to tag the output file name (month-day-year-hour)
dt = datetime.now().strftime('%m-%d-%y-%H')

#%% LOAD DATA
train = pd.read_csv(r'./train.csv')

#%% fit a logit on columns 2:-1 of the training frame
logit_model = Logit(train['target'], train.iloc[:, 2:-1])
logit_model = logit_model.fit(disp=0)  # disp=0 Don't show convergence message.

#%% in-sample predictions and AUC
predsLog = logit_model.predict(train.iloc[:, 2:-1])
predsLog_auc = roc_auc_score(train['target'], predsLog)
print('========================================================')
print('Val_AUC = ', round(predsLog_auc, 5))
print('========================================================')

#%% predict test
test = pd.read_csv(r'./test.csv')
predsLog2 = logit_model.predict(test.iloc[:, 1:-1])

#%% output: write a timestamped submission file of ID_code + predicted target
testoutput = pd.DataFrame()
testoutput['ID_code'] = test['ID_code'].copy()
testoutput['target'] = predsLog2
testoutput.to_csv(r'./logistic_out_%s.csv' % dt, index=False)