Exemplo n.º 1
0
def initial(trainFile, testFile, x, y, state):
    """Load a train/test CSV pair and fit a proportional-hazards model.

    Both files are read with ``pd.read_csv``, column ``L0005`` is shifted by
    ``-10600`` in each, and every column of the training file that is not
    one of ``x``, ``y`` or ``state`` is dropped from both frames.  A
    ``smf.phreg`` model (Efron ties) is then fitted on the training frame.

    NOTE(review): the formula's right-hand side is built from the name
    ``record_keys``, which is neither a parameter nor a local here -- it
    must be defined at module level elsewhere.  ``x`` is only used for
    column selection; confirm that this split is intended.

    Args:
        trainFile: Path/buffer of the training CSV.
        testFile: Path/buffer of the test CSV.
        x: Iterable of column names to keep as covariates.
        y: Column name used as the left-hand side of the phreg formula.
        state: Column name whose training values are passed as ``status``.

    Returns:
        ``(trainData, testData, params)`` where the first two are the
        frames restricted to ``x + [y, state]`` and ``params`` maps each
        entry of ``record_keys`` to the fitted coefficient at the same
        position.
    """
    data1 = pd.read_csv(trainFile)
    data2 = pd.read_csv(testFile)

    data1['L0005'] = data1['L0005'] - 10600
    data2['L0005'] = data2['L0005'] - 10600
    print(data2.head())
    print("训练集数量:", len(data1))
    print("测试集数量:", len(data2))

    titleList = data1.columns.values.tolist()
    print(titleList)
    x_keys = list(x) + [y, state]

    # Drop, from both frames, every training column that is not requested.
    # A column missing from the test frame raises KeyError.
    unused = [name for name in titleList if name not in x_keys]
    data1 = data1.drop(columns=unused)
    data2 = data2.drop(columns=unused)
    print(data1.head())
    print(data2.head())
    trainData = data1[x_keys]
    testData = data2[x_keys]

    status = trainData[state].values

    # Formula of the form "<y>~k1+k2+..." over the module-level record_keys.
    sentence = y + "~" + "+".join(record_keys)
    print(sentence)

    mod = smf.phreg(sentence, trainData, status=status, ties="efron")
    rslt = mod.fit()
    print(rslt.summary())

    # Collect the covariate coefficients B of h(t|X) = h0(t) * exp(X^T B),
    # pairing each key with the coefficient at the same position.
    # Presumably one coefficient per key -- holds when every key is a
    # single numeric column; verify if categorical terms are used.
    params = dict(zip(record_keys, rslt.params))
    print(params)
    return trainData, testData, params
Exemplo n.º 2
0
def initial(file, x, y, state, id):
    """Encode a churn CSV, split it 60/40 and fit a proportional-hazards model.

    The file is read with ``pd.read_csv`` and rows containing missing
    values are dropped.  Label columns are converted to integers (see the
    tables below), all columns other than ``x``, ``y``, ``state`` and
    ``id`` are dropped, and the remainder is split with
    ``train_test_split(test_size=0.4, random_state=7)``.  A ``smf.phreg``
    model (Efron ties) is fitted on the training part.

    NOTE(review): the formula's right-hand side is built from the name
    ``record_keys``, which is neither a parameter nor a local here -- it
    must be defined at module level elsewhere.  ``x`` is only used for
    column selection; confirm that this split is intended.

    Args:
        file: Path/buffer of the CSV to load.
        x: Iterable of column names to keep as covariates.
        y: Column name used as the left-hand side of the phreg formula.
        state: Column name whose training values are passed as ``status``.
        id: Additional column name to keep (the parameter name shadows the
            builtin but is kept for caller compatibility).

    Returns:
        ``(trainData, testData, params)`` where ``params`` maps each entry
        of ``record_keys`` to the fitted coefficient at the same position.
    """
    data = pd.read_csv(file)
    data = data.dropna()

    # Two-valued columns: 1 where the cell equals the given label, else 0.
    binary_columns = (
        ("gender", "Male"),
        ("Partner", "Yes"),
        ("Dependents", "Yes"),
        ("PhoneService", "Yes"),
        ("PaperlessBilling", "Yes"),
        ("Churn", "Yes"),
    )
    for column, positive_label in binary_columns:
        data[column] = (data[column] == positive_label).astype(int)

    # Multi-valued columns: label -> integer code.  Series.map leaves NaN
    # for any label that is not listed.
    internet_addon_codes = {'No internet service': 0, 'No': 1, 'Yes': 2}
    coded_columns = (
        ("MultipleLines", {'No phone service': 0, 'No': 1, 'Yes': 2}),
        ("InternetService", {'No': 0, 'DSL': 1, 'Fiber optic': 2}),
        ("TechSupport", internet_addon_codes),
        ("StreamingTV", internet_addon_codes),
        ("Contract", {'Month-to-month': 0, 'One year': 1, 'Two year': 2}),
        ("PaymentMethod", {
            'Electronic check': 0,
            'Mailed check': 1,
            'Bank transfer (automatic)': 2,
            'Credit card (automatic)': 3,
        }),
    )
    for column, codes in coded_columns:
        data[column] = data[column].map(codes)

    print(data.head())
    print("最开始的数量:", len(data))

    # Split into training and test sets.
    titleList = data.columns.values.tolist()
    print(titleList)
    x_keys = list(x) + [y, state, id]
    unused = [name for name in titleList if name not in x_keys]
    data = data.drop(columns=unused)
    print(data.head())
    X = data[x_keys]
    seed = 7
    test_size = 0.4
    # Only X needs splitting; the row selection depends solely on the row
    # count, test_size and random_state.
    trainData, testData = train_test_split(X,
                                           test_size=test_size,
                                           random_state=seed)

    print("切分后训练集data:", len(trainData))
    print("切分后测试集data:", len(testData))

    status = trainData[state].values

    # Formula of the form "<y>~k1+k2+..." over the module-level record_keys.
    sentence = y + "~" + "+".join(record_keys)
    print(sentence)

    mod = smf.phreg(sentence, trainData, status=status, ties="efron")
    rslt = mod.fit()
    print(rslt.summary())

    # Collect the covariate coefficients B of h(t|X) = h0(t) * exp(X^T B),
    # pairing each key with the coefficient at the same position.
    # Presumably one coefficient per key -- verify if categorical terms
    # are ever used in record_keys.
    params = dict(zip(record_keys, rslt.params))
    print(params)
    return trainData, testData, params
# Users absent from `reached`: take each user's last Seniority_days value
# and flag them with reached = 0.
not_reached_rows = EUhist.loc[~EUhist['user'].isin(reached)]
notgotit = (not_reached_rows
            .groupby('user')['Seniority_days']
            .last()
            .reset_index())
notgotit['reached'] = 0

# Stack both user groups, attach each user's type (exactly one row per user
# on both sides, enforced by validate='1:1') and add type_* dummy columns.
dfEU = pd.concat([gotit, notgotit], axis=0)
dfEU = dfEU.merge(dta[['Id', 'user_types']],
                  left_on='user',
                  right_on='Id',
                  how='inner',
                  validate='1:1')
type_dummies = pd.get_dummies(dfEU['user_types'], prefix='type')
dfEU = pd.concat([dfEU, type_dummies], axis=1)

# Proportional-hazards fit with type_2 and type_3 as covariates.
mod = smf.phreg("Seniority_days ~ 0 + type_2 + type_3",
                status=dfEU['reached'].values,
                data=dfEU,
                ties="efron")
rslt = mod.fit()
print(rslt.summary())

# One survival curve per user type (1, 2, 3) on a shared axis.
fig, ax = plt.subplots()
for tp in range(1, 4):
    is_type = dfEU['user_types'] == tp
    sf = sm.SurvfuncRight(dfEU.loc[is_type, "Seniority_days"],
                          dfEU.loc[is_type, "reached"])
    sf.plot(ax)
li = ax.get_lines()
# Each plot call contributes two lines; hide the second of each pair (the
# cross markers -- presumably the censoring marks, TODO confirm).
for cross_index in (1, 3, 5):
    li[cross_index].set_visible(False)
plt.legend((li[0], li[2], li[4]), ('Type 1', 'Type 2', 'Type 3'))
plt.ylim(0.8, 1)
Exemplo n.º 4
0
# Print whatever sm_probit_manual.cov_params() returns (object defined
# earlier in the file, outside this excerpt).
probit_cov = sm_probit_manual.cov_params()
print(probit_cov)

############################################################
# Cox-proportional
############################################################
# Load the R "flchain" dataset (package "survival"), discard the "chapter"
# column and then every row that still has a missing value.
data = sm.datasets.get_rdataset("flchain", "survival").data
data = data.drop(columns=["chapter"]).dropna()

# Derived columns used by the formula below.
data["lam"] = data["lambda"]
data["female"] = (data["sex"] == "F").astype(int)
data["year"] = data["sample.yr"] - data["sample.yr"].min()
status = data["death"].values

cox_formula = ("futime ~ age + female + creatinine + "
               "np.sqrt(kappa) + np.sqrt(lam) + year + mgus")
mod = smf.phreg(cox_formula, data, status=status, ties="efron")
rslt = mod.fit()
print(rslt.summary())

############################################################
# phreg on loan data
############################################################
# Root data directory; the leading "~" is stored as-is (not expanded here).
PATH1 = '~/Google Drive/PWBM/CECL/data'
# PATH1 = 'c:/Users/hanjh/Documents/Google Drive/PWBM/CECL/data/fannie_mae'
# Directory <PATH1>/fannie_mae/clean.
PATH2 = os.path.join(PATH1, 'fannie_mae', 'clean')
filename = 'noMOD_5M'
# Fourth-quarter labels for 2000 through 2005.
YEARS = [str(year) + 'Q4' for year in range(2000, 2006)]

df = pd.read_csv(os.path.join(
Exemplo n.º 5
0
#!/Users/bernardroesler/anaconda3/envs/insight/bin/python3
#==============================================================================
#     File: phreg_statsmodels_ex.py
#  Created: 06/19/2018, 12:17
#   Author: Bernie Roesler
#
"""
  Description:
"""
#==============================================================================
# numpy is never called directly, but the formula below contains
# ``np.sqrt(...)``, which is evaluated in this module's namespace; without
# this import the model construction fails with a NameError on ``np``.
import numpy as np  # noqa: F401
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the R "flchain" dataset (package "survival"), discard the "chapter"
# column and then every row that still has a missing value.
data = sm.datasets.get_rdataset("flchain", "survival").data
del data["chapter"]
data = data.dropna()

# Derived columns referenced by the formula.
data["lam"] = data["lambda"]
data["female"] = (data["sex"] == "F").astype(int)
data["year"] = data["sample.yr"] - min(data["sample.yr"])
status = data["death"].values

# Proportional-hazards regression of futime with Efron tie handling.
mod = smf.phreg("futime ~ 0 + age + female + creatinine + "
                "np.sqrt(kappa) + np.sqrt(lam) + year + mgus",
                data, status=status, ties="efron")
rslt = mod.fit()
print(rslt.summary())

#==============================================================================
#==============================================================================
import pandas as pd
# from sklearn.model_selection import train_test_split
# import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fit a proportional-hazards regression on the merged CSV and print the
# fit summary.
df = pd.read_csv('df_merged5.csv')

# The last column is passed to phreg as the status argument.
outcome = df.iloc[:, -1]
# Every column except the first and the last; the formula's variables
# (PERMTH_INT, DMPFSEQ_x, BDPEXFLR, BDPSCAN) are looked up in this frame.
predictors = df.iloc[:, 1:-1]

mod = smf.phreg("PERMTH_INT ~ DMPFSEQ_x + BDPEXFLR + BDPSCAN",
                predictors,
                status=outcome,
                ties="efron")

result = mod.fit()
print(result.summary())

# X_train, X_test, y_train, y_test = train_test_split(predictors, outcome)