import numpy as np
import pandas as pd

FullRaw = pd.concat([FullRaw, dummyDf], axis=1)

# Encode the target: 'No' -> 1, everything else -> 0
FullRaw['smoker'] = np.where(FullRaw['smoker'] == 'No', 1, 0)

# Train/test split
from sklearn.model_selection import train_test_split
Train, Test = train_test_split(FullRaw, test_size=0.3, random_state=123)

Train_X = Train.drop(['smoker'], axis=1)
Train_Y = Train['smoker'].copy()
Test_X = Test.drop(['smoker'], axis=1)
Test_Y = Test['smoker'].copy()

# Logistic regression (note: statsmodels Logit does not add an intercept automatically)
from statsmodels.api import Logit
M1_Model = Logit(Train_Y, Train_X).fit()
M1_Model.summary()

# Predicted probabilities on the test set
Test_pred = M1_Model.predict(Test_X)

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

Test['Test_prob'] = Test_pred
Test['Test_Class'] = np.where(Test['Test_prob'] > 0.5, 1, 0)

# Confusion matrix and accuracy (sklearn convention: y_true first, y_pred second)
Con_Mat = confusion_matrix(Test_Y, Test['Test_Class'])
sum(np.diag(Con_Mat)) / Test_Y.shape[0] * 100

# AUC is computed from the true labels and the predicted probabilities, not the
# thresholded classes; roc_curve returns fpr, tpr, thresholds
from sklearn.metrics import roc_auc_score, roc_curve
AUC = roc_auc_score(Test_Y, Test['Test_prob'])
fpr, tpr, thresholds = roc_curve(Test_Y, Test['Test_prob'])
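# Illustrative sketch (an addition, not part of the original script): the metric
# functions imported above can summarise the thresholded test predictions.
print("Precision:", precision_score(Test_Y, Test['Test_Class']))
print("Recall   :", recall_score(Test_Y, Test['Test_Class']))
print("F1 score :", f1_score(Test_Y, Test['Test_Class']))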
# Imports needed by this fragment (assumed to appear earlier in the full script)
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.api import Logit, add_constant
from sklearn.metrics import classification_report
print("Done ...")

print("\n*** Recreate Train Data ***")
dfX_train = dfTrain[allCols]
y_train = dfTrain[clsVars].values
print("Done ...")

# model object
print("\n*** Model ***")
# add intercept manually (statsmodels does not add one by default)
dfX_train_const = add_constant(dfX_train)
# build model and fit training data
model = Logit(y_train, dfX_train_const).fit()
# print the model summary
print(model.summary())
print("Done ...")

################################
# Classification - Predict Train
# evaluate : Accuracy & Confusion Metrics
################################

# Probability Distribution for train data
prob_train = model.predict(dfX_train_const)
# sort the prob dist for visualization
sorted_train = sorted(prob_train.values)
index_train = np.arange(len(sorted_train))
# plot it
plt.figure()
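# Illustrative continuation (an assumption; the original fragment stops at
# plt.figure()): show the sorted training probabilities as a simple line plot.
plt.plot(index_train, sorted_train)
plt.xlabel("Observation (sorted by probability)")
plt.ylabel("Predicted probability")
plt.title("Probability distribution - train data")
plt.show()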
print(Temp_Column_Name, ": ", Max_VIF) if (Max_VIF >= 10): # This condition will ensure that ONLY columns having VIF lower than 10 are NOT dropped print(Temp_Column_Name, Max_VIF) Train_X_Copy = Train_X_Copy.drop(Temp_Column_Name, axis = 1) High_VIF_Column_Names.extend(Temp_Column_Name) Train_x.drop(['Loan_Amount_Term','Self_Employed','Gender'], axis = 1, inplace=True) Test_x.drop(['Loan_Amount_Term','Self_Employed','Gender'], axis = 1, inplace=True) Train_x.shape Test_x.shape from statsmodels.api import Logit Model1 = Logit(Train_y, Train_x).fit() Model1.summary() col_names = ['ApplicantIncome','Dependents'] Model2 = Logit(Train_y,Train_x.drop( col_names, axis= 1)).fit() Model2.summary() Test_x.drop(['ApplicantIncome','Dependents'], axis = 1, inplace=True) Test_x['Predit'] = Model2.predict(Test_x) Test_x.columns Test_x['Predit'][0:6] import numpy as np Test_x['Test_class']=np.where(Test_x['Predit']>=0.5, 1, 0) import pandas as pd
# Imports inferred from the names used in this module (the original fragment
# does not show them): Logit, Series, ttest, linspace, warnings.
import warnings
from numpy import linspace
from pandas import Series
from scipy.stats import ttest_ind as ttest
from statsmodels.api import Logit


class PropensityScore:
    """
    Parameters
    ----------
    outcome : str
        The name of the binary variable to predict.
    test_vars : list
        A list of the variables to test.
    df : DataFrame
        The pandas DataFrame that contains all of the data.
    init_vars : str or list, optional
        Variables to always include in the propensity score. The default is None.
    add_cons : Boolean, optional
        Select this to add a constant to the model. The default is True.
    disp : Boolean, optional
        Display the final model including dropped variables. The default is True.
    cutoff_ord1 : Numeric, optional
        The log gain cutoff for first order covariates. The default is 1.
    cutoff_ord2 : Numeric, optional
        The log gain cutoff for second order covariates. The default is 2.71.
    t_strata : Numeric, optional
        The cutoff for the t-statistic for the calculated strata. The default is 1.
    n_min : {'n_min_strata': int1, 'n_min_tc': int2} or 'auto'
        The minimum number of units in each stratum, or of treated/control
        individuals per stratum. The default is 'auto', in which case the number
        per stratum is the number of covariates tested in the propensity score
        (just linear ones) + 2 (i.e. K+2), while the minimum number of treated
        and control individuals per stratum is 3. If not 'auto', the input must
        be a dictionary that explicitly specifies:
        {'n_min_strata': int1, 'n_min_tc': int2}

    Raises
    ------
    ValueError
        If variables are improperly defined, this prints out warnings.

    Returns
    -------
    self.data : DataFrame
        A new frame of just the outcome and potential covariates.
    self.dropped_vars : list
        The variables that did not make the cut for singularity reasons.
    self.model : sm.Logit.fit() model
        The raw Statsmodels model on the final set of variables.
    self.propscore : Series
        The propensity score as calculated by self.model.fittedvalues. This may
        not match the dimension of the data due to dropped missing values, but
        the index will align properly.
    self.strata : Series
        The calculated strata. Missing propensity scores and values outside of
        the min of the treated group or the max of the control group are coded
        as NaN.
    self.logodds : Series
        The linearized propensity score. Same dimension as propscore.
    self.test_vars_ord2 : list
        The full list of tested second order variables, for reference.
    self.trim_range : tuple
        The result of calculating the optimal trim min and max propensity score
        values.
    self.in_trim : Series (True/False)
        True means that the propensity score falls within the trim min/max range.
    """

    def __init__(self, outcome, test_vars, df, init_vars=None, add_cons=True,
                 disp=True, cutoff_ord1=1, cutoff_ord2=2.71, t_strata=1,
                 n_min='auto'):
        # double checking some inputs
        if type(outcome) != str:
            raise ValueError('y must be a string variable name in the DataFrame.')
        if type(test_vars) != list:
            raise ValueError('X must be a list of covariates to test.')
        self.outcome = outcome
        self.test_vars = test_vars
        self.add_cons = add_cons
        self.init_vars = init_vars
        if init_vars and type(init_vars) == str:
            covs = [init_vars] + test_vars
        elif init_vars and type(init_vars) == list:
            covs = init_vars + test_vars
        else:
            covs = test_vars
        if n_min == 'auto':
            n_min_strata = len(covs) + 2
            n_min_tc = 3
        else:
            if type(n_min) != dict:
                raise ValueError('n_min must be "auto" or a dictionary')
            elif ('n_min_tc' not in n_min) or ('n_min_strata' not in n_min):
                raise ValueError('Must specify both n_min_strata (ex. K+2) '
                                 'and n_min_tc (ex. 3)')
            n_min_strata = n_min['n_min_strata']
            n_min_tc = n_min['n_min_tc']
        if 'propscore' in covs + [outcome] or 'logodds' in covs + [outcome]:
            raise ValueError(
                'You cannot have variables labeled "propscore" or "logodds"')

        data = df[[outcome] + covs].copy()
        ord2_vars = []
        dropped_vars = []
        # looping through covariates
        for idx, cc in enumerate(covs):
            # first a gut check to make sure all the variables aren't singular
            if len(data[cc].dropna().unique()) == 1:
                raise ValueError('{} only takes on one value'.format(cc))
            # for all variables generate the interaction terms
            for jj in covs[idx + 1:]:
                testvar = data[cc] * data[jj]
                if (not testvar.equals(data[cc])
                        and not testvar.equals(data[jj])
                        and len(testvar.dropna().unique()) > 1):
                    data.loc[:, 'X'.join([cc, jj])] = testvar
                    ord2_vars.append('X'.join([cc, jj]))
                else:
                    dropped_vars.append('X'.join([cc, jj]))
            # for continuous variables, generate squared term
            if not data[cc].equals(data[cc]**2):
                data.loc[:, '{}_sq'.format(cc)] = data[cc]**2
                ord2_vars.append('{}_sq'.format(cc))
            else:
                dropped_vars.append('{}_sq'.format(cc))
        if add_cons:
            data.loc[:, '_cons'] = 1
        self.data = data
        self.dropped_vars = dropped_vars
        self.test_vars_ord2 = ord2_vars

        # =====================================================================
        # Actually calculating the propensity score
        # =====================================================================
        linear = self.model_from_group(self.test_vars, cutoff=cutoff_ord1,
                                       init_vars=self.init_vars)
        squared = self.model_from_group(ord2_vars, cutoff=cutoff_ord2,
                                        init_vars=linear)
        if add_cons:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared + ['_cons']],
                               missing='drop').fit(disp=False)
        else:
            self.model = Logit(self.data[self.outcome], self.data[squared],
                               missing='drop').fit(disp=False)
        self.logodds = self.model.fittedvalues.rename('logodds')
        self.propscore = Series(self.model.predict(),
                                index=self.logodds.index, name='propscore')
        self.trim_range = self.calc_trim(self.propscore)
        self.in_trim = (self.propscore.ge(self.trim_range[0])
                        & self.propscore.le(self.trim_range[1])).rename('in_trim')
        self.strata = self.stratify(self.data[self.outcome], self.logodds,
                                    t_max=t_strata,
                                    n_min_strata=n_min_strata,
                                    n_min_tc=n_min_tc)
        if disp:
            print(self.model.summary())
            print('The following vars were infeasible: {}'.format(
                ', '.join(self.dropped_vars)))
            print('Stratification produced {} strata'.format(
                len(self.strata.dropna().unique())))

    def best_in_group(self, newvars, basevars=None):
        '''Get the best variable for the score among a set of new variables.'''
        if not basevars and self.add_cons:
            basevars = ['_cons']
        elif basevars and self.add_cons:
            basevars = basevars + ['_cons']
        elif not basevars and not self.add_cons:
            raise ValueError(
                'Must specify at least one covariate for baseline model')
        origmod = Logit(self.data[self.outcome], self.data[basevars],
                        missing='drop').fit(disp=False)
        list_llf = []
        for cc in newvars:
            try:
                newmod = Logit(self.data[self.outcome],
                               self.data[basevars + [cc]],
                               missing='drop').fit(disp=False)
                # compare sample sizes (the original compared origmod.nobs to
                # itself, which is always 1)
                if newmod.nobs / origmod.nobs < .95:
                    warnings.warn('Using {} causes more than 5% '
                                  'of the sample to be dropped'.format(cc))
                list_llf.append(newmod.llf)
            except Exception:
                if cc not in self.dropped_vars:
                    self.dropped_vars.append(cc)
                list_llf.append(origmod.llf)
        idx = list_llf.index(max(list_llf))
        return newvars[idx], 2 * (list_llf[idx] - origmod.llf)

    def model_from_group(self, test_vars, cutoff, init_vars=None):
        '''Iterate through a list repeatedly until no variable adds enough gain.'''
        remaining = test_vars.copy()
        if init_vars and type(init_vars) == str:
            final = [init_vars]
            init_vars = [init_vars]
        elif init_vars and type(init_vars) == list:
            final = init_vars.copy()
        else:
            final = []
        while len(remaining) > 0:
            temp, gain_add = self.best_in_group(remaining, basevars=final)
            if gain_add > cutoff:
                final.append(temp)
                remaining.remove(temp)
            else:
                break
        return final

    # defined as a static method so it can be called on any generic series
    @staticmethod
    def stratify(outcome, logodds, n_min_strata, n_min_tc=3, t_max=1):
        """
        Calculate strata from a given outcome variable and log-odds.

        Specify the cutoff for the t-statistic in t_max, the minimum number of
        observations for each stratum in n_min_strata, and the minimum number
        of treated or control observations per stratum in n_min_tc.

        Parameters
        ----------
        outcome : Series
            Binary variable denoting treatment outcome.
        logodds : Series
            The calculated log-odds (a transformation of the propensity score).
        n_min_strata : int
            The minimum number of observations per stratum.
        n_min_tc : int
            The minimum number of treated or control observations per stratum.
            Default is 3.
        t_max : float
            The maximum t-statistic value acceptable in a stratum before
            splitting. Default is 1.

        Returns
        -------
        strata : Series
            The calculated strata. Missing propensity scores and values outside
            of the min of the treated group or the max of the control group are
            coded as NaN.
        """
        if type(outcome) != Series or type(logodds) != Series:
            raise ValueError('Expecting pandas series as inputs')

        # helper function to facilitate indexing
        def above_med(x):
            return (x >= x.median()).astype(int)

        outcome = outcome.rename('outcome').to_frame()
        df = outcome.join(logodds)
        minmax = df.groupby('outcome')['logodds'].agg(['max', 'min'])
        df = df.loc[df.logodds.ge(minmax.loc[1, 'min'])
                    & df.logodds.le(minmax.loc[0, 'max'])
                    & df.logodds.notnull()]
        # initialize the strata, potential blocks, and the change while loop
        df.loc[:, 'strata'] = 0
        df.loc[:, 'block'] = 0
        change = True
        while change:
            # get the medians of the strata
            df.loc[:, 'medgrp'] = df.groupby('strata')['logodds'].apply(above_med)
            for ii in df.strata.unique():
                # simplify the notation
                sub = df.loc[df.strata.eq(ii), :].copy()
                # calculate t-stat and a grouper with number of groups
                t_test = ttest(sub.loc[sub.outcome.eq(1), 'logodds'],
                               sub.loc[sub.outcome.eq(0), 'logodds'],
                               nan_policy='omit').statistic
                n = sub.groupby(['medgrp', 'outcome'])['logodds'].count()
                # make new blocks
                if (t_test > t_max and min(n) >= n_min_tc
                        and min(n.groupby('medgrp').sum()) >= n_min_strata):
                    df.loc[df.strata.eq(ii), 'block'] = df.loc[df.strata.eq(ii),
                                                               'medgrp']
            if df.block.sum() == 0:
                change = False
            else:
                # getting ready for the next loop
                df.strata = df.groupby(['strata', 'block']).ngroup()
                df.block = 0
        return outcome.join(df.strata).strata

    # defined as a static method so it can be called on any generic series
    @staticmethod
    def calc_trim(propscore):
        y = 1 / (propscore * (1 - propscore))
        if y.max() <= (2 / y.count()) * (y.sum()):
            return 0, 1
        for gamma in linspace(y.max(), 0, 10000):
            lhs_estimand = (gamma / y.count()) * (y.le(gamma).sum())
            rhs_estimand = (2 / y.count()) * ((y.le(gamma) * y).sum())
            if lhs_estimand < rhs_estimand:
                break
        alpha = .5 - ((.25 - (1 / gamma))**.5)
        return alpha, 1 - alpha
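# Illustrative usage sketch (an addition; the DataFrame and column names below
# are hypothetical, not from the original module):
#
#   import pandas as pd
#   df = pd.read_csv('study.csv')          # must contain 'treated' plus covariates
#   ps = PropensityScore(outcome='treated',
#                        test_vars=['age', 'income', 'educ'],
#                        df=df)
#   df = df.join(ps.propscore).join(ps.strata)   # scores and strata align by index
#   print(ps.trim_range)                         # optimal trimming bounds
#   print(ps.in_trim.mean())                     # share of units inside the trim range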
print(Temp_Column_Name, ":", Max_VIF) Train_X_Copy = Train_X_Copy.drop(Temp_Column_Name, axis=1) High_VIF_Col_Names.extend(Temp_Column_Name) counter = counter + 1 High_VIF_Col_Names Train_X = Train_X.drop(High_VIF_Col_Names, axis=1) Test_X = Test_X.drop(High_VIF_Col_Names, axis=1) from statsmodels.api import Logit M1 = Logit(Train_Y, Train_X).fit() M1.summary() Col_To_Drop = ['Academic_Qualification_Postgraduate'] M2 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit() M2.summary() Col_To_Drop.append('Marital_Unknown') M3 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit() M3.summary() Col_To_Drop.append('June_Bill_Amount') M4 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit() M4.summary() Col_To_Drop.append('Age_Years') M5 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit()
d = {'target': y_test}

# In[25]:

test_y = pd.DataFrame(data=d)
test_y.head(10)

# In[26]:

from statsmodels.api import Logit

# In[35]:

rm.seed(123)
SB_logit = Logit(y_train, x_train).fit()
SB_logit.summary()

# In[36]:

SB_pred = SB_logit.predict(x_test)

# In[37]:

test_y['predicted'] = SB_pred
test_y.head(10)

# In[38]:

test_y['pred_round'] = 1

# In[39]:
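# Illustrative continuation (an assumption; cell In[39] is empty in the original
# fragment): replace the placeholder pred_round with rounded class labels.
test_y['pred_round'] = test_y['predicted'].round().astype(int)
test_y.head(10)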
def main():
    index = ["RIIPL_ID"]
    pop = CachePopulationSubsets(population, index)
    pop["ROW_ID"] = np.arange(len(pop))
    outcomes = pd.read_csv(outcomes_file)
    words = pd.read_csv(words_file, index_col="WORD_ID")

    # Load counts and convert to CSR sparse matrix for efficient row slicing
    counts = mmread(counts_file).tocsr()

    # Further divide training data into training and validation sets for
    # selecting the optimal number of topics
    training = (pop["SUBSET"] == "TRAINING")
    np.random.seed(seed)
    subset = np.random.choice([True, False], len(training), p=[0.25, 0.75])
    validation = (training & subset)
    training = (training & ~subset)
    print(training.sum(), "training")
    print(validation.sum(), "validation")

    # Create training and validation outcomes
    y_train = outcomes.loc[training, "OUTCOME_ANY"].values
    y_validate = outcomes.loc[validation, "OUTCOME_ANY"].values
    print(y_train.sum(), "training outcomes")
    print(y_validate.sum(), "validation outcomes")

    # Transform raw counts to TF-IDF using IDF from the training set
    training = np.where(training)[0]
    validation = np.where(validation)[0]
    counts_train = counts[training, :]
    tfidf = TfidfTransformer()
    tfidf.fit(counts_train)
    counts = tfidf.transform(counts)
    counts_train = counts[training, :]
    counts_validate = counts[validation, :]

    # Select the NMF model with the best AUC performance on validation data
    best = 0
    best_auc = 0
    nmfs = []
    for i, n in enumerate(ntopics):
        print(n, "topics:")
        nmf = NMF(n, random_state=seed).fit(counts_train)
        nmfs.append(nmf)
        X_train = pd.DataFrame(nmf.transform(counts_train))
        X_train["intercept"] = 1
        logit = Logit(y_train, X_train).fit(maxiter=1000, method="cg")
        print(logit.summary())
        X_validate = pd.DataFrame(nmf.transform(counts_validate))
        X_validate["intercept"] = 1
        y_pred = logit.predict(X_validate)
        auc = roc_auc_score(y_validate, y_pred)
        print("AUC:", auc)
        if (auc - best_auc) > delta:
            best = i
            best_auc = auc
        else:
            break
    print("selected", ntopics[best], "topics")

    # Turn the best NMF topics into features
    features = pd.DataFrame(nmfs[best].transform(counts))
    features.columns = [
        "MEDICAID_TOPIC_{}".format(i) for i in range(ntopics[best])
    ]
    features["RIIPL_ID"] = pop["RIIPL_ID"]
    features = features.set_index("RIIPL_ID")

    # Use the top 10 words in a topic as its description
    # (reversed argsort so the most heavily weighted words come first; the
    # original slice [-11:-1] skipped the single top word)
    top10words = [
        " ".join(words.loc[i, "WORD"] for i in topic.argsort()[::-1][:10])
        for topic in nmfs[best].components_
    ]
    descs = [
        "Topic {} ({})".format(i, words) for i, words in enumerate(top10words)
    ]
    labels = dict(zip(features.columns, descs))

    SaveFeatures(features, out, manifest, population, labels)
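# Illustrative entry point (an assumption; the original fragment does not show
# how main() is invoked or where population, outcomes_file, words_file,
# counts_file, ntopics, delta, seed, out, and manifest are defined).
if __name__ == "__main__":
    main()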