def weight_generating(x, y, param=0):
    """Compute weights from a probit fit, normalised to have mean one.

    A probit of ``y`` on ``x`` (plus a constant) is fitted, the helper ``im``
    is evaluated for every observation, and the resulting values are divided
    by their mean.  ``im`` is defined elsewhere; judging by the progress-bar
    label it presumably returns an inverse Mills ratio -- TODO confirm.

    Args:
        x: Regressors.  With ``param=1`` it must support boolean-mask
            indexing and ``.iloc`` (i.e. behave like a pandas DataFrame).
        y: Outcome, one entry per observation; ``len(y)`` is used as the
            number of observations.
        param: ``0`` fits on the full sample and evaluates ``im`` on the
            model's fitted values.  ``1`` refits on random subsamples (each
            row kept with probability 1/2) until the first estimated
            parameter is not NaN, then evaluates ``im`` on the linear
            predictor ``[1, x_row] . params`` of every row of ``x``.

    Returns:
        numpy.ndarray: ``im`` values divided by their mean.

    Raises:
        ValueError: If ``param`` is neither 0 nor 1.
    """
    # Fail early: previously an unsupported value surfaced later as a
    # NameError on the never-assigned model.
    if param not in (0, 1):
        raise ValueError('param must be 0 or 1, got {!r}'.format(param))

    print('---------- Generating weights ----------')
    if param == 0:
        model = Probit(y, add_constant(x), missing='drop')
        Probit_model = model.fit()
    else:
        while True:
            random_index = [random.choice([True, False]) for _ in range(len(y))]
            X = x[random_index]
            Y = y[random_index]
            model = Probit(Y, add_constant(X), missing='drop')
            Probit_model = model.fit()
            # np.asarray gives true positional access whether params is a
            # pandas Series (labelled by column name) or a plain array.
            if not np.isnan(np.asarray(Probit_model.params)[0]):
                break

    if param == 0:
        # Positional view of the fitted values, independent of y's index labels.
        fitted = np.asarray(Probit_model.fittedvalues)

    IM_list = []
    # Iterating over range(len(y)) gives tqdm a known total.
    for count in tqdm(range(len(y)), desc='Computing inverse Mills ratios', ncols=100):
        if param == 0:
            IM_list.append(im(fitted[count]))
        else:
            # Leading 1 matches the constant column added by add_constant.
            row = [1, *x.iloc[count]]
            IM_list.append(im(dot(row, Probit_model.params)))

    weight = np.true_divide(IM_list, np.mean(IM_list))
    return weight
def estimate_recession_probability(cls, macro_data=None, macro_indicator=None, start_date=None):
    """Fit one probit per half-life of "recession within the window" on a smoothed indicator.

    For each half-life in ``halflifes`` the macro indicator is smoothed with an
    exponentially weighted mean, a ``Probit`` of the forward-looking recession
    flag on that smoothed series is fitted, and the normal CDF of the fitted
    linear predictor is stored in the matching column of ``recession_probs``.

    Args:
        macro_data: Forwarded to ``cls.retrieve_recession_data``.
        macro_indicator: Series to smooth; its index is also the index of
            ``recession_probs``.
        start_date: Forwarded to ``cls.retrieve_recession_data``.

    NOTE(review): the visible code ends without returning ``recession_probs``,
    ``logits`` or ``ll``; confirm whether a return statement was lost.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded.
    from scipy.stats import norm
    from statsmodels.discrete.discrete_model import Probit

    recession_data, _, _ = cls.retrieve_recession_data(
        macro_data=macro_data, start_date=start_date)

    # Settings for the analysis
    recession_prediction_window = 252
    windows = [1, 3, 6, 9, 12, 18, 24]
    halflifes = [0.5, 1, 2, 3, 6, 9, 12]

    ll = []
    logits = []
    recession_probs = pd.DataFrame(index=macro_indicator.index, columns=halflifes)

    # RIW = 1 when at least one recession observation falls in the next
    # `recession_prediction_window` rows (rolling sum shifted backwards, capped at 1).
    recession_in_window = pd.DataFrame(index=macro_indicator.index)
    rolling_total = recession_data.rolling(
        window=recession_prediction_window, center=False).sum()
    recession_in_window['RIW'] = np.minimum(
        1, rolling_total.shift(-recession_prediction_window))

    for window, halflife in zip(windows, halflifes):
        logit_iv = pd.DataFrame(index=recession_data.index)
        logit_iv['Recession'] = recession_data
        logit_iv['RIW'] = recession_in_window['RIW']

        # Exponential moving average of the macro indicator
        window_ma = macro_indicator.ewm(
            halflife=halflife, ignore_na=False, min_periods=0, adjust=True).mean()
        logit_iv['F1M'] = window_ma.copy()
        logit_iv['LF1'] = window_ma.shift(window)
        # Every half-life doubles to a whole number; cast so diff() receives an int.
        logit_iv['DF1'] = window_ma.diff(int(halflife * 2))
        logit_iv['INT'] = logit_iv['F1M'] * logit_iv['DF1']

        # Take an explicit copy of the filtered rows and fill RIW on it; the
        # previous chained ``logit_iv['RIW'][mask] = 0`` wrote through an
        # intermediate object and is not guaranteed to modify ``logit_iv``.
        keep = logit_iv['DF1'].notna() & logit_iv['F1M'].notna()
        logit_iv = logit_iv[keep].copy()
        logit_iv['RIW'] = logit_iv['RIW'].fillna(0)

        result = Probit(logit_iv['RIW'], logit_iv['F1M']).fit()
        predictions = result.fittedvalues[~np.isnan(result.fittedvalues)]
        logits.append(result)
        ll.append(result.llr)

        recession_probs.loc[logit_iv.index, halflife] = norm.cdf(predictions)
def probit_parameterized_ranked(x, y, arrival_choice, cut_choice, cap_mode_choice=2, save=True):
    """Fit a probit on an 80/20 train/test split and persist the results.

    The model summary is always printed and written to
    ``probit_arrival-{arrival}_cut-{cut}_cap-{cap}.txt`` inside
    ``<results_path>/<zone>/ranked`` (``results_path`` and ``zone`` are
    module-level names defined elsewhere).  The R2 score of the predicted
    probabilities on the held-out 20% is printed as well.

    Args:
        x (pd.DataFrame): Features; a constant column is added.
        y (pd.DataFrame): Target.
        arrival_choice (int): Choice of arrival day. -1 for all.
        cut_choice (int): Cut selection; -1 for all.  The original note read
            "0 for before cut 1, 2 for before cut 2 and after and 2 for both
            0 and 1", which is ambiguous -- TODO confirm the coding.
        cap_mode_choice (int, optional): 0 for slot capacity only, 1 for slot
            + daily average, 2 for all.
        save (bool): If true, additionally passes a one-row frame holding the
            choices, the fitted parameters and the R2 score to
            ``save_result_to_file`` under the name ``probit.csv``.

    Returns:
        The fitted probit results object.
    """
    x = sm.add_constant(x)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=0)

    probit_model = Probit(y_train, x_train).fit()
    summary_text = str(probit_model.summary())
    print(summary_text)

    output_dir = os.path.join(results_path, zone, 'ranked')
    file_name = 'probit_arrival-{}_cut-{}_cap-{}.txt'.format(
        arrival_choice, cut_choice, cap_mode_choice)
    # Written once; the original rewrote the identical file a second time
    # inside the ``save`` branch.
    with open(os.path.join(output_dir, file_name), "w") as text_file:
        text_file.write(summary_text)

    r2 = r2_score(y_test, probit_model.predict(x_test))

    if save:
        output = {
            'mode': cap_mode_choice,
            'arrival_choice': arrival_choice,
            'cut_choice': cut_choice
        }
        # One entry per estimated parameter, keyed by its name.
        output.update(probit_model.params.items())
        output['r2'] = r2
        save_result_to_file(pd.DataFrame([output]), 'probit.csv', output_dir)

    print('R2 score = ', r2)
    return probit_model
def setup_class(cls):
    """Fit the same probit specification via GLM and via Probit.

    Both fits use Newton's method with ``tol=1e-10``.  ``cls.infl1`` receives
    an ``MLEInfluence`` built from the GLM result and ``cls.infl0`` the
    ``get_influence()`` object of the ``Probit`` result.
    """
    frame = data_bin
    regressors = ['const', 'log_rate', 'log_volumne']
    newton = dict(method="newton", tol=1e-10)

    # Binomial GLM with a probit link.
    glm_model = GLM(frame['constrict'], frame[regressors],
                    family=families.Binomial(link=families.links.probit()))
    glm_result = glm_model.fit(**newton)

    from statsmodels.discrete.discrete_model import Probit
    probit_model = Probit(frame['constrict'], frame[regressors])
    probit_result = probit_model.fit(**newton)

    cls.infl1 = MLEInfluence(glm_result)  # res.get_influence()
    cls.infl0 = probit_result.get_influence()
def probitfn(dataset):
    """Return ``-pdf(H) / cdf(H)`` for column ``H`` of ``dataset.RawData``.

    A probit of column ``f`` on ``H`` is fitted first, but its result is not
    used for the returned value.
    """
    raw = dataset.RawData
    outcome = raw['f']
    regressor = raw['H']

    # The fit is kept for its side effects (convergence output, input checks).
    Probit(outcome, regressor.astype(float)).fit()

    # NOTE(review): ``Probit.pdf`` / ``Probit.cdf`` are called on the class, so
    # ``outcome`` is bound to ``self`` and the functions are evaluated on the
    # raw ``H`` values rather than on the fitted linear predictor.  If an
    # inverse Mills ratio from the fitted model was intended, this needs the
    # fitted values instead -- confirm before changing, since it alters output.
    density = Probit.pdf(outcome, regressor)
    distribution = Probit.cdf(outcome, regressor)
    return -1 * density / distribution
def probit():
    '''
    Simulate a labour-supply panel and print a probit of LFP on Z.
    '''
    # Parameters handed to LaborSupplySim.
    p = {
        'γ': 0.8,
        'β': 1,
        'a': 0,
        'ρ': 1,
        'η': 0.2,
        'δ': -0.2,
        'δ0': -0.1,
        'ν': 0.5,
    }

    simdata_a0 = LaborSupplySim(p, 10000)
    simdata_a0.simulate()
    simdata_a0.generate_panel()

    panel = simdata_a0.PanelData
    participation = panel['LFP']
    regressor = panel['Z']

    fitted = Probit(participation, regressor.astype(float)).fit()
    print(fitted.summary())
def probit(df, y_var, X_vars, add_intercept=True):
    """Fit a probit model, replicating Stata's ``probit`` command.

    The model is only meaningful when the dependent variable is a 0/1
    variable.  Rows with a missing value in the dependent variable or in any
    explanatory variable are dropped before fitting; missing values in other
    columns of ``df`` no longer remove observations.

    NOTE(review): the original docstring also mentioned "at least one and at
    most two fixed-effect variables"; nothing in this function handles fixed
    effects, so that sentence was presumably copied from another routine.

    Inputs.
    ---------
    df:pd.DataFrame, the data for the probit model.
    y_var:str, the column name of the dependent variable (should be 0/1).
    X_vars:list of str, the list of explanatory variable names.
    add_intercept:bool, if True a column ``intercept`` equal to 1.0 is placed
        before the explanatory variables.

    Outputs.
    ---------
    res:obj, the fitted results returned by ``Probit.fit``.
    """
    explanatory = list(X_vars)
    # Restrict to the model's own columns *before* dropping missing rows so
    # that NaNs in unrelated columns do not shrink the estimation sample.
    new_df = df[[y_var] + explanatory].dropna().copy()
    y = new_df[y_var]

    if add_intercept:
        new_df['intercept'] = 1.0
        X = new_df[['intercept'] + explanatory]
    else:
        X = new_df[explanatory]

    probit_mod = Probit(endog=y, exog=X, check_rank=True, missing="drop")
    res = probit_mod.fit(start_params=None, method='newton', maxiter=35,
                         full_output=1, disp=1, callback=None)
    return res
import pandas as pd
import numpy as np
from preprocessor import Preprocessor
from statsmodels.discrete.discrete_model import Probit

pd.set_option('display.max_columns', 10)

# Load the raw data and let the project's Preprocessor produce the splits.
df = pd.read_excel("Project 2 - Data.xls")
preprocessor = Preprocessor(df)
x_train, y_train, x_test, y_test = preprocessor.combine()

# Fit on the training split and score the test split.
model = Probit(y_train, x_train)
probit_model = model.fit()
predict_proba = probit_model.predict(x_test)


def predict(predict_proba):
    """Turn probabilities into 0/1 labels; values strictly above 0.5 become 1."""
    return [1 if probability > 0.5 else 0 for probability in predict_proba]


def model_score(prediction, target):
    """Count the positions at which ``prediction`` and ``target`` agree.

    NOTE(review): the visible body never returns ``score``, so callers receive
    ``None``; confirm what was meant to be returned (count or accuracy).
    """
    score = 0
    for position in range(len(prediction)):
        is_match = prediction[position] == target[position]
        if is_match:
            score += 1
def _table1_fit(df_reg, covariates, disp_it):
    """Fit a probit of ``_oral`` on *covariates* plus a constant, clustered by ``_region``."""
    endog = df_reg['_oral']
    # Work on an explicit copy so that adding the constant never writes into
    # a slice of ``df_reg`` (avoids SettingWithCopyWarning).
    exog = df_reg[covariates].copy()
    exog['int'] = np.repeat(1, len(endog))
    model = Probit(endog, exog)
    fitted = model.fit(cov_type='cluster',
                       cov_kwds={'groups': df_reg['_region']},
                       disp=disp_it)
    return model, fitted.get_margeff()


def _table1_column(margeff, covariate_label, legal_label):
    """Build one table column: effect, (std. error), p-value for both terms, then footer rows."""
    cells = []
    # Position 0 is 'sales', position 2 is 'dsalesX1970' in every specification.
    for position in (0, 2):
        cells.append(round(margeff.margeff[position], 3))
        cells.append('({})'.format(round(margeff.margeff_se[position], 3)))
        cells.append(round(margeff.pvalues[position], 3))
    cells.append(round(margeff.results.nobs, 3))
    cells.append(round(margeff.results.llf, 3))
    cells.extend([covariate_label, legal_label])
    return cells


def table1_reg(df_reg, disp_it):
    """Function to create the tables for the first probit models.

    Four nested probit specifications of ``_oral`` are estimated with
    standard errors clustered on ``_region``; marginal effects are obtained
    with ``get_margeff()`` and summarised in one table.

    Args:
        df_reg: DataFrame containing the categorical variables as dummies and
            the interaction terms.
        disp_it: Boolean indicating whether information about the optimiser
            iterations should be displayed.

    Returns:
    -------
        A tuple ``(table, model1, model2, model3, model4)``: the table with
        the regression output of the four specifications and the four
        (unfitted) ``Probit`` model objects.
    """
    base = ['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970',
            'dreg2', 'dreg3', 'dreg4',
            'dreg2X1970', 'dreg3X1970', 'dreg4X1970']
    with_legal = base + ['any', 'anyX1970']
    with_demographics = with_legal + [
        'd_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',
        'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970',
        '_Catholic', '_CatholicX1970',
        'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16',
        'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970', 'd_ed_cat16X1970',
        'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4',
        'd_hinccat1X1970', 'd_hinccat2X1970', 'd_hinccat3X1970', 'd_hinccat4X1970']
    with_ideal = with_demographics + [
        'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5',
        'd_idealcat2X1970', 'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970']

    # (covariates, 'Additional Covariates' label, 'Legal Variables' label)
    specifications = [
        (base, 'R', 'PX'),
        (with_legal, 'R', 'PX, AD'),
        (with_demographics, 'R,A,C,E,I', 'PX, AD'),
        (with_ideal, 'R,A,C,E,I', 'PX, AD, K'),
    ]

    models = []
    columns = {}
    for number, (covariates, covariate_label, legal_label) in enumerate(specifications, start=1):
        model, margeff = _table1_fit(df_reg, covariates, disp_it)
        models.append(model)
        columns['({})'.format(number)] = _table1_column(
            margeff, covariate_label, legal_label)

    row_labels = ['Sales ban', '', 'p-value', 'Sales ban x 1(1970)', ' ', 'p-value',
                  'Observations', 'Log Likelihood',
                  'Additional Covariates', 'Legal Variables']
    table = pd.DataFrame(columns, index=pd.Index(row_labels, name=' '))

    model1, model2, model3, model4 = models
    return table, model1, model2, model3, model4
def _table2_fit(df_reg, outcome, covariates, disp_it):
    """Fit a probit of *outcome* on *covariates* plus a constant, clustered by ``_region``."""
    endog = df_reg[outcome]
    # Work on an explicit copy so that adding the constant never writes into
    # a slice of ``df_reg`` (avoids SettingWithCopyWarning).
    exog = df_reg[covariates].copy()
    exog['int'] = np.repeat(1, len(endog))
    model = Probit(endog, exog)
    fitted = model.fit(cov_type='cluster',
                       cov_kwds={'groups': df_reg['_region']},
                       disp=disp_it)
    return model, fitted.get_margeff()


def _table2_cells(margeff):
    """Return effect, (std. error), p-value for the three sales-ban terms, then nobs and llf."""
    cells = []
    # Position 0 is 'sales', 4 is 'dsalesX1965', 3 is 'dsalesX1970'.
    for position in (0, 4, 3):
        cells.append(round(margeff.margeff[position], 3))
        cells.append('({})'.format(round(margeff.margeff_se[position], 3)))
        cells.append(round(margeff.pvalues[position], 3))
    cells.append(round(margeff.results.nobs, 3))
    cells.append(round(margeff.results.llf, 3))
    return cells


def table2_reg(df_reg, disp_it):
    """Function to create the tables for the second probit models.

    Four nested probit specifications are estimated for each of the two
    dependent variables ``_everuse_d`` and ``_barrier``, with standard errors
    clustered on ``_region``; marginal effects are obtained with
    ``get_margeff()`` and summarised in one table.

    Args:
        df_reg: DataFrame containing the categorical variables as dummies and
            the interaction terms.
        disp_it: Boolean indicating whether information about the optimiser
            iterations should be displayed.

    Returns:
    -------
        A tuple ``(table, model1, model2, model3, model4, model1_help,
        model2_help, model3_help, model4_help)``: the table with the output
        of the 8 specifications, the four ``_barrier`` models, and the four
        ``_everuse_d`` models (the ``*_help`` entries).
    """
    base = ['sales', 'd1970', 'd1965', 'dsalesX1970', 'dsalesX1965',
            '_Phys', 'd_PhysX1970', 'd_PhysX1965',
            'dreg2', 'dreg3', 'dreg4',
            'dreg2X1970', 'dreg3X1970', 'dreg4X1970',
            'dreg2X1965', 'dreg3X1965', 'dreg4X1965']
    with_legal = base + ['any', 'anyX1970', 'anyX1965']
    with_demographics = with_legal + [
        'd_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',
        'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970',
        'd_agecat20X1965', 'd_agecat25X1965', 'd_agecat30X1965', 'd_agecat35X1965',
        '_Catholic', '_CatholicX1970', '_CatholicX1965',
        'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16',
        'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970',
        'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965',
        'd_ed_cat16X1970', 'd_ed_cat16X1965',
        'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4',
        'd_hinccat1X1970', 'd_hinccat2X1970', 'd_hinccat3X1970', 'd_hinccat4X1970',
        'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', 'd_hinccat4X1965']
    with_ideal = with_demographics + [
        'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5',
        'd_idealcat2X1970', 'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970',
        'd_idealcat2X1965', 'd_idealcat3X1965', 'd_idealcat4X1965', 'd_idealcat5X1965']
    specifications = (base, with_legal, with_demographics, with_ideal)

    # 1. _everuse_d as dependent variable
    pill_fits = [_table2_fit(df_reg, '_everuse_d', covariates, disp_it)
                 for covariates in specifications]
    # 2. _barrier as dependent variable
    barrier_fits = [_table2_fit(df_reg, '_barrier', covariates, disp_it)
                    for covariates in specifications]

    # 3. create table for output
    # ('Additional Covariates', 'Legal Variables') footer per specification.
    footers = (('R', 'PX'), ('R', 'PX, AD'),
               ('R,A,C,E,I', 'PX, AD'), ('R,A,C,E,I', 'PX, AD, K'))
    columns = {}
    for number, ((_, pill_margeff), (_, barrier_margeff), footer) in enumerate(
            zip(pill_fits, barrier_fits, footers), start=1):
        columns['({})'.format(number)] = (
            [' '] + _table2_cells(pill_margeff)           # 'Ever used Pill' section
            + [' ', ' '] + _table2_cells(barrier_margeff)  # spacer + 'Ever used barrier'
            + list(footer))

    # Labels are reproduced verbatim (including the 'Obersvations' spelling),
    # since callers may select rows by label.
    row_labels = ['Ever used Pill', 'Sales ban', '', 'p-value',
                  'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value',
                  'Obersvations', 'Log Likelihood', ' ',
                  'Ever used barrier', 'Sales ban', '', 'p-value',
                  'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value',
                  'Obersvations', 'Log Likelihood',
                  'Additional Covariates', 'Legal Variables']
    table = pd.DataFrame(columns, index=pd.Index(row_labels, name=' '))

    model1, model2, model3, model4 = (model for model, _ in barrier_fits)
    # Fix: the fourth helper used to alias the *third* _everuse_d model.
    model1_help, model2_help, model3_help, model4_help = (
        model for model, _ in pill_fits)
    return (table, model1, model2, model3, model4,
            model1_help, model2_help, model3_help, model4_help)
#https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.Probit.html
from statsmodels.discrete.discrete_model import Probit

# Encode the yes/no ``child`` column as 1/0 and regress it on ``age`` alone
# (no constant column is passed to the model).
child_indicator = df.child.map({'yes': 1, 'no': 0})
age_only = df[['age']]

p = Probit(child_indicator, age_only)
a = p.fit()
a.summary2()
def core(tsy, dfx, map={0: '女', 1: '男'}):
    """Fit a binary probit and package coefficient, accuracy and ROC tables.

    Args:
        tsy: pandas Series holding the 0/1 target; its ``name`` must be a
            string because it is used to label the prediction column.
        dfx: DataFrame of explanatory variables; a constant column is
            prepended and renamed to '截距'.
        map: Mapping from the class codes 0/1 to display labels (the name
            shadows the builtin but is part of the public signature).

    Returns:
        A two-element list: a dict whose ``'tables'`` entry holds three table
        descriptions (coefficients, prediction quality, ROC curve), and a list
        with one dict carrying the actual-vs-predicted DataFrame.

    Raises:
        AssertionError: If ``tsy`` contains values other than exactly {0, 1}.
    """
    assert set(tsy.unique()) == {0, 1}, 'Y值只能为0或1'
    dfx = sm.add_constant(dfx, prepend=True)
    dfx = dfx.rename(columns={'const': '截距'})
    res = Probit(tsy, dfx).fit()

    # predict result
    prediction_probs = res.predict()
    tsy_predict = pd.Series(
        [1 if i >= 0.5 else 0 for i in prediction_probs],
        name='预测的' + tsy.name,
        index=tsy.index)
    df_predict_result = pd.concat([tsy, tsy_predict], axis=1)

    # confusion matrix
    # Rows/columns come back ordered by the sorted labels, so label them with
    # [0, 1] explicitly; ``tsy.unique()`` is in order of appearance and would
    # swap the labels whenever the first observation is 1.
    class_labels = [0, 1]
    df_confusion_matrix = pd.DataFrame(
        confusion_matrix(tsy, tsy_predict, labels=class_labels),
        index=class_labels, columns=class_labels)
    df_confusion_matrix.index = df_confusion_matrix.index.map(map)
    df_confusion_matrix.columns = df_confusion_matrix.columns.map(map)

    # report
    df_report = pd.DataFrame(
        list(precision_recall_fscore_support(tsy, tsy_predict)),
        index=['精确度', '召回率', 'F1-值', '样本个数']).T.round(5)
    df_report.index = df_report.index.map(map)

    # roc
    fpr, tpr, _ = roc_curve(tsy, prediction_probs)
    roc_auc = auc(fpr, tpr)
    i = np.arange(len(tpr))  # index for df
    df_roc = pd.DataFrame({
        '假阳性率': pd.Series(fpr, index=i),
        '真阳性率': pd.Series(tpr, index=i)
    })

    # model description
    # The coefficient table is recovered by round-tripping the summary's
    # second table through HTML.
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)
    dfinfo1 = dfinfo1.T.set_index('Variables').T
    dfinfo1.index.name = '项'
    dfinfo1.columns.name = '参数类型'
    dfinfo1.columns = ['回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    dfinfo1['or值'] = np.exp(res.params)
    df_description = dfinfo1

    # Totals row: sample counts are summed, the quality metrics averaged over
    # the two classes.  ``DataFrame.append`` was removed in pandas 2.0, hence
    # ``pd.concat``; a single ``.loc`` replaces the chained assignment, and
    # precision is now averaged like recall and F1 instead of being left as a
    # sum of the two class precisions.
    total_label = '总和/平均'
    df_report = pd.concat(
        [df_report, df_report.sum().to_frame(name=total_label).T])
    df_report.loc[total_label, ['精确度', '召回率', 'F1-值']] /= 2
    df_report = df_report.T
    df_report['name'] = ['模型效果', '模型效果', '模型效果', '样本量']

    df_confusion_matrix = pd.concat(
        [df_confusion_matrix,
         df_confusion_matrix.sum().to_frame(name=total_label).T])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix['name'] = ['混淆矩阵', '混淆矩阵']
    df_confusion_matrix = pd.concat(
        [df_confusion_matrix, df_report]).reset_index().set_index(['name', 'index'])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix.columns.names = [None, None]

    df_predict_result = df_predict_result.round(5)
    df_confusion_matrix = df_confusion_matrix.round(5)
    df_roc = df_roc.round(5)
    df_description = df_description.round(5)

    return [{
        'tables': [
            {
                'table_info': '二元Probit回归分析结果汇总',
                'table_json': '{}',
                'table_html': df_description.to_html(),
                'chart': ['line', 'bar']
            },
            {
                'table_info': '二元Probit回归预测效果汇总:',
                'table_json': df_confusion_matrix.T.reset_index().to_json(),
                'table_html': df_confusion_matrix.to_html(),
                'chart': []
            },
            {
                'table_info': "ROC曲线(曲线下面积:%0.3f)" % roc_auc,
                'table_json': df_roc.to_json(),
                'table_html': df_roc.to_html(),
                'chart': ['scatter']
            },
        ]
    }, [{
        'table_df': df_predict_result,
        'label': '实际值与预测值'
    }]]
def probit_reg(x, y):
    """Univariate probit regression with an intercept.

    Args:
        x: Array-like holding the single regressor; it is flattened, so both
            shape ``(n,)`` and ``(n, 1)`` are accepted.
        y: Outcome with one entry per observation.

    Returns:
        The fitted parameters, intercept first and slope second.
    """
    regressor = np.asarray(x).reshape(-1)
    # Size the intercept column from the data; it used to be hard-coded to
    # 10 rows, which failed for every other sample size.
    design = np.column_stack([np.ones(len(regressor)), regressor])
    pm = Probit(y, design)
    return pm.fit().params
# Script: fit a probit of "hits" on delta, epsilon and their cross term from
# HL.csv, print the summary, then show a 3D scatter of the raw observations.
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
'''Probit analysis plus plotting 3D graph of hit rate distribution with respect to delta and theta'''
# NOTE(review): the string above mentions theta, but the scatter below plots
# epsilon (x) against delta (y); ``theta`` is read and never used.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Receive Data
data = pd.read_csv("HL.csv")
print(data)
# Explanatory columns; no constant column is added to the design matrix.
col = ["delta", "epsilon", "cross_term"]
dep_var = data["hits"].tolist()
X = data[col]
theta = data["theta"]
# NOTE(review): ``Probit`` is not among the imports listed at the top of this
# script; it must already be in scope from elsewhere in the file.
z = Probit(dep_var, X)
result = z.fit()
print(result.summary())
# ``z`` is rebound here from the model object to the array of observed hits.
z = np.array(data["hits"].tolist())
x = np.array(data["epsilon"].tolist())
y = np.array(data["delta"].tolist())
print(z)
ax.scatter(x, y, z, s=1, c=None, depthshade=True)
plt.show()
def _fnBlockContribution(dfBlock, iNobs):
    """Return one propensity-score block's weighted ATE and variance terms.

    Parameters
    ----------
    dfBlock : pandas.DataFrame
        Rows belonging to a single block; must contain the columns 'vdY'
        (outcome) and 'vdD' (treatment indicator).
    iNobs : int
        Total sample size; the block weight is ``len(dfBlock) / iNobs``.

    Returns
    -------
    tuple
        ``(dMean, dV)``: the difference of arm means times the block weight,
        and the summed per-arm variance terms times the squared block weight.

    Raises
    ------
    IndexError
        If the block does not contain two distinct 'vdD' values, because
        both arm means are required.
    """
    dWeight = dfBlock.shape[0] / float(iNobs)
    gbY = dfBlock.groupby('vdD')['vdY']

    # groupby sorts its keys, so position 0 holds the smaller vdD value and
    # position 1 the larger one (untreated / treated, assuming 0/1 coding).
    vdArmMean = gbY.mean()
    dMean = (vdArmMean.iloc[1] - vdArmMean.iloc[0]) * dWeight

    # Per row: squared deviation from the row's own arm mean, divided by the
    # squared size of that arm; then summed within each arm.
    vdSqDev = (dfBlock['vdY'] - gbY.transform('mean')) ** 2
    vdArmSize = gbY.transform('count')
    vdArmV = (vdSqDev / vdArmSize ** 2).groupby(dfBlock['vdD']).sum()
    dV = (vdArmV.iloc[1] + vdArmV.iloc[0]) * dWeight ** 2

    return dMean, dV


def main():
    """Run a Monte Carlo study of a blocked propensity-score ATE estimator.

    Each iteration generates a sample with the ``fnGen*`` helpers, fits a
    probit of the treatment indicator on the covariates, splits the
    observations into equal-width propensity-score blocks and accumulates the
    block-weighted difference in mean outcomes (ATE), its variance term, a
    test statistic and the probit pseudo R-squared.

    The first and last propensity-score blocks are left out of the estimate;
    only the interior blocks contribute.

    Returns
    -------
    pandas.DataFrame
        ``scipy.stats.describe`` of the stored ATE estimates, wrapped in a
        DataFrame. Previously this frame was built and discarded, so the
        function returned ``None``.
    """
    # Magic numbers
    dMux = 0
    dSigmax = 1
    dMuepsilon = 0
    dSigmaepsilon = 1
    dMueta = 0
    dSigmaeta = 1
    iNobs = 1000
    vdBeta = np.array([1, 2])
    vdZeta = np.array([3, 4])
    vdDezinho = np.array([0])
    iSeed = 6969
    iNgroups = 11
    iIter = 1000

    # Initialisation
    np.random.seed(iSeed)
    vdBeta = np.array(vdBeta).reshape(-1, 1)
    vdZeta = np.array(vdZeta).reshape(-1, 1)
    iLenbeta = len(vdBeta)

    # Storage for the ATE, variance, test statistic and pseudo R-squared.
    dvATE = np.ones(iIter)
    dvVar = np.ones(iIter)
    dvTtest = np.ones(iIter)
    dvRsquared = np.ones(iIter)

    # Block edges do not depend on the draw, so build them once.
    vdGroups = np.linspace(0, 1, iNgroups)

    # NOTE(review): only iIter - 1 iterations run, so the last slot of each
    # result vector keeps its initial value of 1; the report below drops that
    # slot with [:-1]. Kept as in the original -- confirm this is intended.
    for i in range(iIter - 1):
        # The order of the generator calls is preserved so the random stream
        # (and therefore every simulated sample) is unchanged.
        mdX = fnGenX(iNobs, iLenbeta, dMux, dSigmax)
        vdEpsilon = fnGenError(iNobs, dMuepsilon, dSigmaepsilon)
        vdPstar = fnGenPstar(mdX, vdBeta, vdEpsilon)
        vdD = fnGenTreat(vdPstar)
        vdEta = fnGenError(iNobs, dMueta, dSigmaeta)
        vdY = fnGenY(vdD, vdDezinho, mdX, vdZeta, vdEta)

        # One frame with outcome, treatment and covariates; the covariate
        # names follow the width of mdX instead of being hard-coded.
        lsXNames = ['vdX%d' % (j + 1) for j in range(mdX.shape[1])]
        dfData = pd.DataFrame(np.hstack([vdY, vdD, mdX]),
                              columns=['vdY', 'vdD'] + lsXNames)
        dfData["vdD"] = dfData["vdD"] == 1

        # Estimation: probit of treatment on the covariate columns.
        dfX = dfData[dfData.columns[-mdX.shape[1]:]].copy()
        probit_model = Probit(dfData['vdD'], dfX).fit()
        dRsquare = probit_model.prsquared

        # predict() may hand back a pandas Series, which has no reshape();
        # go through a plain ndarray before forming the column vector.
        vdProbs = np.asarray(probit_model.predict(dfX)).reshape(-1, 1)

        # Outcome, treatment and propensity score side by side.
        dfFinalData = pd.DataFrame(np.hstack([vdY, vdD, vdProbs]),
                                   columns=['vdY', 'vdD', 'vdPS'])

        # Accumulate the interior blocks [edge k, edge k+1) for
        # k = 1 .. iNgroups - 3; the first and last blocks are skipped.
        dATE = 0.0
        dVar = 0.0
        for k in range(1, iNgroups - 2):
            vbInBlock = ((dfFinalData['vdPS'] >= vdGroups[k])
                         & (dfFinalData['vdPS'] < vdGroups[k + 1]))
            dMean, dV = _fnBlockContribution(dfFinalData.loc[vbInBlock], iNobs)
            dATE += dMean
            dVar += dV

        # NOTE(review): dVar is already scaled by squared arm sizes; confirm
        # that the additional division by iNobs is intended.
        dTTest = dATE / (math.sqrt(dVar / iNobs))

        # Store results
        dvATE[i] = dATE
        dvVar[i] = dVar
        dvTtest[i] = dTTest
        dvRsquared[i] = dRsquare

    # Report results (the never-filled last slot is excluded).
    return pd.DataFrame(stats.describe(dvATE[:-1]))
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.discrete.discrete_model import Probit

# Plot styling; the second call is applied after the first one.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# In[13]:

# Response and regressors taken from df, which is defined outside this
# section. No constant column is added to the design matrix.
y = df["Outcome"]
x = df.loc[:, [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "Age",
    "DiabetesPedigreeFunction",
]]

# Logit fit on the full set of regressors.
logit_model = sm.Logit(y, x)
result = logit_model.fit()
print(result.summary())

# # Probit Regression

# In[14]:

# Same response and regressors, probit link instead of logit.
probitmodel = Probit(y, x)
probit_model = probitmodel.fit()
print(probit_model.summary())