def initial(trainFile, testFile, x, y, state):
    """Load train/test CSVs and fit a Cox proportional-hazards model.

    Both files are read with ``pd.read_csv``, 10600 is subtracted from the
    ``L0005`` column of each, and both frames are narrowed to the columns
    named by ``x``, ``y`` and ``state``.  A ``smf.phreg`` model with Efron
    tie handling is then fitted on the training frame.  Progress information
    (heads, row counts, formula, summary, coefficients) is printed.

    Args:
        trainFile: Training CSV, passed straight to ``pd.read_csv``.
        testFile: Test CSV, passed straight to ``pd.read_csv``.
        x: Iterable of covariate column names to keep.
        y: Name of the duration column; left-hand side of the formula.
        state: Name of the event-indicator column, passed as ``status``.

    Returns:
        A ``(trainData, testData, params)`` tuple: the two narrowed frames
        and a dict mapping each name in ``record_keys`` to the fitted
        coefficient at the same position.

    Raises:
        KeyError: If a name in ``x``, ``y`` or ``state`` (or ``L0005``) is
            missing from either file.

    NOTE(review): the formula terms come from the module-level
    ``record_keys``, not from ``x``.  Presumably callers pass the same list
    as ``x``; confirm, otherwise the formula can name columns that were
    dropped here.
    """
    data1 = pd.read_csv(trainFile)
    data2 = pd.read_csv(testFile)

    # Fixed offset applied to both sets; the meaning of 10600 is not
    # documented in this file.
    data1['L0005'] = data1['L0005'] - 10600
    data2['L0005'] = data2['L0005'] - 10600
    print(data2.head())
    print("训练集数量:", len(data1))
    print("测试集数量:", len(data2))

    titleList = data1.columns.values.tolist()
    print(titleList)

    x_keys = [*x, y, state]

    # Keep only the needed columns.  Each frame is filtered against its own
    # header, so an unneeded column that exists in only one of the two files
    # can no longer raise KeyError.
    data1 = data1.loc[:, data1.columns.isin(x_keys)]
    data2 = data2.loc[:, data2.columns.isin(x_keys)]
    print(data1.head())
    print(data2.head())

    trainData = data1[x_keys]
    testData = data2[x_keys]
    status = trainData[state].values

    # Formula of the form "<y>~<key1>+<key2>+...".
    sentence = y + "~" + "+".join(record_keys)
    print(sentence)

    mod = smf.phreg(sentence, trainData, status=status, ties="efron")
    rslt = mod.fit()
    print(rslt.summary())

    # Covariate coefficients B of h(t|X) = h0(t) * exp(X^T B), keyed by name.
    params = {key: rslt.params[pos] for pos, key in enumerate(record_keys)}
    print(params)
    return trainData, testData, params
def initial(file, x, y, state, id):
    """Load a churn CSV, encode it, split it and fit a Cox model.

    The file is read with ``pd.read_csv``, rows with missing values are
    dropped, and the categorical text columns are replaced by integer codes.
    The frame is narrowed to the columns named by ``x``, ``y``, ``state`` and
    ``id``, split 60/40 into train and test sets with a fixed seed, and a
    ``smf.phreg`` model with Efron tie handling is fitted on the training set.

    Args:
        file: CSV source, passed straight to ``pd.read_csv``.
        x: Iterable of covariate column names to keep.
        y: Name of the duration column; left-hand side of the formula.
        state: Name of the event-indicator column, passed as ``status``.
        id: Name of the identifier column to keep.  The name shadows the
            ``id`` builtin inside this function; it is kept so existing
            keyword callers continue to work.

    Returns:
        A ``(trainData, testData, params)`` tuple: the two splits and a dict
        mapping each name in ``record_keys`` to the fitted coefficient at the
        same position.

    NOTE(review): the formula terms come from the module-level
    ``record_keys``, not from ``x`` -- confirm they are meant to match.
    NOTE(review): another ``initial`` with a different signature appears
    earlier in this source; if both live in the same module, this definition
    replaces it.
    """
    data = pd.read_csv(file)
    data = data.dropna()

    # Two-valued text columns -> 0/1; the value listed is the one coded as 1.
    positive_value = {
        "gender": "Male",
        "Partner": "Yes",
        "Dependents": "Yes",
        "PhoneService": "Yes",
        "PaperlessBilling": "Yes",
        "Churn": "Yes",
    }
    for column, positive in positive_value.items():
        data[column] = (data[column] == positive).astype(int)

    # Multi-valued text columns -> small integer codes.  Series.map leaves
    # NaN for any value missing from its table, and that happens after
    # dropna() has already run.
    level_codes = {
        "MultipleLines": {'No phone service': 0, 'No': 1, 'Yes': 2},
        "InternetService": {'No': 0, 'DSL': 1, 'Fiber optic': 2},
        "TechSupport": {'No internet service': 0, 'No': 1, 'Yes': 2},
        "StreamingTV": {'No internet service': 0, 'No': 1, 'Yes': 2},
        "Contract": {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
        "PaymentMethod": {
            'Electronic check': 0,
            'Mailed check': 1,
            'Bank transfer (automatic)': 2,
            'Credit card (automatic)': 3,
        },
    }
    for column, codes in level_codes.items():
        data[column] = data[column].map(codes)

    print(data.head())
    print("最开始的数量:", len(data))

    # Split into training and test sets.
    titleList = data.columns.values.tolist()
    print(titleList)

    x_keys = [*x, y, state, id]
    data = data.loc[:, data.columns.isin(x_keys)]
    print(data.head())

    X = data[x_keys]
    seed = 7
    test_size = 0.4
    # Only one frame needs splitting; the shuffle depends on the row count,
    # test_size and random_state, so the partition matches the one obtained
    # when an extra (empty) frame was split alongside it.
    trainData, testData = train_test_split(
        X, test_size=test_size, random_state=seed)
    print("切分后训练集data:", len(trainData))
    print("切分后测试集data:", len(testData))

    status = trainData[state].values

    # Formula of the form "<y>~<key1>+<key2>+...".
    sentence = y + "~" + "+".join(record_keys)
    print(sentence)

    mod = smf.phreg(sentence, trainData, status=status, ties="efron")
    rslt = mod.fit()
    print(rslt.summary())

    # Covariate coefficients B of h(t|X) = h0(t) * exp(X^T B), keyed by name.
    params = {key: rslt.params[pos] for pos, key in enumerate(record_keys)}
    print(params)
    return trainData, testData, params
# Users that are not listed in `reached`: keep the last recorded
# Seniority_days per user and flag them with reached = 0.
notgotit = (
    EUhist[~EUhist['user'].isin(reached)]
    .groupby('user')['Seniority_days']
    .last()
    .reset_index()
)
notgotit['reached'] = 0

# Stack both groups, then attach each user's type.  validate='1:1' makes the
# merge fail if a user or an Id occurs more than once.
dfEU = pd.concat([gotit, notgotit])
dfEU = dfEU.merge(
    dta[['Id', 'user_types']],
    left_on='user',
    right_on='Id',
    how='inner',
    validate='1:1',
)

# One indicator column per user type (type_<value>), appended column-wise.
type_dummies = pd.get_dummies(dfEU['user_types'], prefix='type')
dfEU = pd.concat([dfEU, type_dummies], axis=1)

# Cox proportional-hazards fit on the type indicators; `reached` is the
# event status.
# NOTE(review): statsmodels' PHReg handles the intercept itself and recent
# versions warn about explicit "0"/"1" formula terms -- confirm whether the
# "0 +" is still wanted.
mod = smf.phreg(
    "Seniority_days ~ 0 + type_2 + type_3",
    data=dfEU,
    status=dfEU['reached'].values,
    ties="efron",
)
rslt = mod.fit()
print(rslt.summary())

# One survival curve per user type, all on the same axes.
fig, ax = plt.subplots()
for tp in (1, 2, 3):
    of_type = dfEU['user_types'] == tp
    sf = sm.SurvfuncRight(
        dfEU.loc[of_type, "Seniority_days"],
        dfEU.loc[of_type, "reached"],
    )
    sf.plot(ax)

# Each sf.plot call is expected to add two lines: the curve and a set of
# cross markers (presumably the censoring times -- confirm against the
# statsmodels docs).  Hide the markers and label only the curves.
li = ax.get_lines()
for marker_pos in (1, 3, 5):
    li[marker_pos].set_visible(False)
plt.legend((li[0], li[2], li[4]), ('Type 1', 'Type 2', 'Type 3'))
plt.ylim(0.8, 1)
print(sm_probit_manual.cov_params()) ############################################################ # Cox-proportional ############################################################ data = sm.datasets.get_rdataset("flchain", "survival").data del data["chapter"] data = data.dropna() data["lam"] = data["lambda"] data["female"] = (data["sex"] == "F").astype(int) data["year"] = data["sample.yr"] - min(data["sample.yr"]) status = data["death"].values mod = smf.phreg( "futime ~ age + female + creatinine + " "np.sqrt(kappa) + np.sqrt(lam) + year + mgus", data, status=status, ties="efron") rslt = mod.fit() print(rslt.summary()) ############################################################ # phreg on loan data ############################################################ PATH1 = '~/Google Drive/PWBM/CECL/data' # PATH1 = 'c:/Users/hanjh/Documents/Google Drive/PWBM/CECL/data/fannie_mae' PATH2 = os.path.join(PATH1, 'fannie_mae', 'clean') filename = 'noMOD_5M' YEARS = ['2000Q4', '2001Q4', '2002Q4', '2003Q4', '2004Q4', '2005Q4'] df = pd.read_csv(os.path.join(
#!/Users/bernardroesler/anaconda3/envs/insight/bin/python3
#==============================================================================
#     File: phreg_statsmodels_ex.py
#  Created: 06/19/2018, 12:17
#   Author: Bernie Roesler
#
"""
  Description: Fit a Cox proportional-hazards model to the R "flchain"
  data set through the statsmodels formula interface and print the summary.
"""
#==============================================================================

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fetch the data set, discard the "chapter" column, then drop every row that
# still has a missing value.
data = (
    sm.datasets.get_rdataset("flchain", "survival")
    .data
    .drop(columns=["chapter"])
    .dropna()
)

# Derived columns used by the formula:
#   lam    - copy of "lambda" under a name usable in a formula
#   female - 1 where sex == "F", else 0
#   year   - sample year relative to the earliest sample year
data = data.assign(
    lam=data["lambda"],
    female=(data["sex"] == "F").astype(int),
    year=data["sample.yr"] - data["sample.yr"].min(),
)

# Event indicator handed to phreg as `status`.
status = data["death"].to_numpy()

# NOTE(review): the formula calls np.sqrt but this script never imports
# numpy; confirm that `np` is resolvable in the formula's evaluation
# environment.
cox_formula = (
    "futime ~ 0 + age + female + creatinine + "
    "np.sqrt(kappa) + np.sqrt(lam) + year + mgus"
)
mod = smf.phreg(cox_formula, data, status=status, ties="efron")
rslt = mod.fit()
print(rslt.summary())

#==============================================================================
#==============================================================================
import pandas as pd
import statsmodels.formula.api as smf

# A scikit-learn train/test split was sketched for this script but is not
# active; the model is fitted on the full data set.

df = pd.read_csv('df_merged5.csv')
df.head()  # result is discarded; only shows something in an interactive session

# Column 0 is skipped.  The last column is used as the event status and the
# columns in between are handed to the model (presumably they include
# PERMTH_INT and the three covariates named in the formula -- confirm).
predictors = df.iloc[:, 1:-1]
outcome = df.iloc[:, -1]

cox_formula = "PERMTH_INT ~ DMPFSEQ_x + BDPEXFLR + BDPSCAN"
mod = smf.phreg(cox_formula, predictors, status=outcome, ties="efron")
result = mod.fit()
print(result.summary())