示例#1
0
def readfile():
    """Load the credit-card default spreadsheet and build model inputs.

    Reads ``default of credit card clients.xls`` from the current working
    directory, cleans it and returns the feature matrix and the target.

    Processing steps:
      * the target column is renamed to ``defaultPaymentNextMonth`` and
        ``PAY_0`` to ``PAY_1``; index labels are converted to ``str``;
      * rows with ``MARRIAGE == 0`` or ``EDUCATION`` equal to 0, 5 or 6 are
        dropped;
      * every ``PAY_n`` (n = 1..6) is split in two: ``PAY_n_CAT`` receives
        the values that are ``<= 0`` (0 elsewhere), while ``PAY_n`` keeps
        only its positive values (0 elsewhere);
      * ``LIMIT_BAL``, ``AGE`` and the target are moved to the end of the
        frame;
      * the feature columns at positions 1..8 are one-hot encoded and the
        remaining columns are passed through unchanged.

    Returns:
        tuple: ``(X, y)``, both cast to ``float32``. ``y`` is selected with a
        boolean column mask, so it keeps its column axis.
    """
    filename = os.getcwd() + '/default of credit card clients.xls'
    df = pd.read_excel(filename,
                       header=1,
                       skiprows=0,
                       index_col=0,
                       na_values={})

    target = "defaultPaymentNextMonth"
    df = df.rename(index=str,
                   columns={
                       "default payment next month": target,
                       "PAY_0": "PAY_1",
                   })

    # Drop rows whose MARRIAGE / EDUCATION codes are treated as outliers
    keep = ((df.MARRIAGE != 0) & (df.EDUCATION != 0) & (df.EDUCATION != 5) &
            (df.EDUCATION != 6))
    df = df[keep]

    pay_names = ["PAY_{}".format(month) for month in range(1, 7)]

    # Inserting in reverse at a fixed position leaves PAY_1_CAT..PAY_6_CAT
    # in ascending order starting at column 5
    for name in reversed(pay_names):
        df.insert(5, name + "_CAT", 0)

    # Grab every PAY_n series up front, before any of them is rewritten
    pay_series = [df[name] for name in pay_names]

    for name, values in zip(pay_names, pay_series):
        cat_name = name + "_CAT"
        # Non-positive codes move to the *_CAT column ...
        df.loc[values <= 0, cat_name] = values
        # ... and are zeroed in the original column
        df.loc[values <= 0, name] = 0
        df[cat_name] = df[cat_name].fillna(value=0)

    # Pop-and-reassign pushes these columns to the end, target last
    for name in ("LIMIT_BAL", "AGE", target):
        df[name] = df.pop(name)

    is_target = df.columns == target
    X = df.loc[:, ~is_target].values
    y = df.loc[:, is_target].values

    transformer = ColumnTransformer(
        [("", OneHotEncoder(categories="auto"), np.arange(1, 9, 1))],
        remainder="passthrough")
    X = transformer.fit_transform(X)

    return X.astype('float32'), y.astype('float32')
示例#2
0
plt.show()
'''

# Create the independent and dependent variables
X = df.loc[:, df.columns != 'defaultPayment'].values
y = df.loc[:, df.columns == 'defaultPayment'].values

# Categorical variables to one-hots
onehotencoder = OneHotEncoder(categories="auto")
X = ColumnTransformer([
    ("", onehotencoder, [2, 3]),
], remainder="passthrough").fit_transform(X)
Y_onehot = onehotencoder.fit_transform(y).toarray()

# Make sure it's all integers, no float - to save memory/computational time?
X.astype(int)
y.astype(int)

# Split and scale the data
seed = 0
Xtrain, Xtest, ytrain, ytest, Y_train_onehot, Y_test_onehot = train_test_split(
    X, y, Y_onehot, test_size=0.2, random_state=seed)

sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
'''
# NOTE(review): in the lines visible here, Xtrain, X and y are only assigned
# inside the triple-quoted text above; confirm they are defined by executed
# code earlier in the file before the statements below run.

# PCA
# Passing 0.97 asks PCA to keep 97% of the variance and lets the algorithm
# pick the number of principal components (presumably
# sklearn.decomposition.PCA; the import is outside this view).
pca = PCA(.97)
pca.fit(Xtrain)
# Only Xtrain is projected in the lines visible here.
Xtrain = pca.transform(Xtrain)
# Column layout of X (by position), as described by the original author:
# Amount given in credit (0)
# social info (1-4) [gender, education, marital status, age]
# History of past payment (5-10): how many months have client delayed
# Amount owed (11-16)
# Amount of previous payment: (17-22)

onehotencoder = OneHotEncoder(categories="auto")

# One-hot encode the columns at positions 1, 2 and 3; every other column is
# passed through unchanged.
X = ColumnTransformer([
    ("", onehotencoder, [1, 2, 3]),
],
                      remainder="passthrough").fit_transform(X)

# Normalize X:
# Every entry is divided by the single largest value in X (one global
# maximum, not a per-column maximum); the [:] slices select everything.
# NOTE(review): ColumnTransformer can return a sparse matrix; confirm that
# np.max behaves as intended on the result.
X = X.astype(float)[:] / np.max(X[:])

#print(X.shape)


def accuracy(p, y):
    """Return the fraction of predictions that match the targets.

    Predictions are rounded with ``np.round`` (which rounds halves to the
    nearest even value, so 0.5 becomes 0), both inputs are flattened, and
    the results are compared element-wise.

    Args:
        p: Array-like of predicted values or probabilities, of any shape.
        y: Array-like of target labels with as many elements as ``p``.

    Returns:
        The share of compared elements that agree, as a float.
    """
    # np.asarray lets plain lists/tuples through; rounding once is enough
    # because np.round is idempotent.
    predicted = np.round(np.asarray(p)).ravel()
    actual = np.asarray(y).ravel()
    matches = predicted == actual
    # Divide by the number of compared elements rather than len(p): len()
    # only measures the first axis, so a (1, n) row vector would otherwise
    # produce a match count instead of a fraction.
    return np.sum(matches) / np.size(matches)


# Hold out 20% of the samples as a test set.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run; confirm whether a fixed seed is wanted for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

##########################
## My own class method  ##
##########################

# Instantiate the LogisticRegression class exposed by `ph` with its default
# constructor arguments (`ph` is imported outside the lines visible here).
mySolver = ph.LogisticRegression()