def readfile():
    """Load the credit-card default spreadsheet and build model inputs.

    Reads ``default of credit card clients.xls`` from the current working
    directory, drops rows whose MARRIAGE is 0 or whose EDUCATION is 0, 5 or
    6, splits every ``PAY_n`` column into a non-positive part
    (``PAY_n_CAT``) and a positive part (``PAY_n``), moves LIMIT_BAL, AGE and
    the target to the end of the frame, and one-hot encodes feature columns
    1 through 8.

    Returns:
        tuple: ``(X, y)``. ``X`` is the transformed feature matrix and ``y``
        is the ``defaultPaymentNextMonth`` column selected as a
        two-dimensional array; both are cast to ``float32``.
    """
    path = os.getcwd() + '/default of credit card clients.xls'
    na_markers = {}
    df = pd.read_excel(
        path, header=1, skiprows=0, index_col=0, na_values=na_markers
    )

    # Stringify the index and normalise the two awkward column names.
    df = df.rename(
        index=str,
        columns={
            "default payment next month": "defaultPaymentNextMonth",
            "PAY_0": "PAY_1",
        },
    )

    # Remove outliers
    keep = (
        (df.MARRIAGE != 0)
        & (df.EDUCATION != 0)
        & (df.EDUCATION != 5)
        & (df.EDUCATION != 6)
    )
    df = df[keep]

    # Categorize PAY_# columns: add zero-filled PAY_n_CAT columns at
    # position 5. Inserting in descending order leaves them ascending.
    months = range(1, 7)
    for month in reversed(months):
        df.insert(5, "PAY_{}_CAT".format(month), 0)

    # Snapshot the PAY_n series before any of them is modified.
    pay_series = [df["PAY_{}".format(month)] for month in months]
    for month, pay in zip(months, pay_series):
        pay_col = "PAY_{}".format(month)
        cat_col = pay_col + "_CAT"
        non_positive = pay <= 0
        # Non-positive codes move to the _CAT column; PAY_n keeps only the
        # positive values.
        df.loc[non_positive, cat_col] = pay
        df.loc[non_positive, pay_col] = 0
        df[cat_col] = df[cat_col].fillna(value=0)

    # Move these columns to the end, in this order, so the target is last.
    for trailing in ('LIMIT_BAL', 'AGE', 'defaultPaymentNextMonth'):
        df[trailing] = df.pop(trailing)

    is_target = df.columns == 'defaultPaymentNextMonth'
    X = df.loc[:, ~is_target].values
    y = df.loc[:, is_target].values

    # NOTE(review): columns 1-8 are presumably EDUCATION, MARRIAGE and the
    # six PAY_n_CAT columns -- this depends on the sheet's original column
    # order, which is not visible here; confirm.
    encoder = OneHotEncoder(categories="auto")
    transformer = ColumnTransformer(
        [
            ("", encoder, np.arange(1, 9, 1)),
        ],
        remainder="passthrough",
    )
    X = transformer.fit_transform(X)

    return X.astype('float32'), y.astype('float32')
plt.show()

# NOTE(review): as seen in this part of the file, the triple-quoted block
# below is a bare string literal, i.e. a disabled copy of an earlier
# preprocessing pipeline that is never executed. Confirm the quotes are
# balanced against the rest of the file before relying on that.
'''
# Create the independent and dependent variables
X = df.loc[:, df.columns != 'defaultPayment'].values
y = df.loc[:, df.columns == 'defaultPayment'].values

# Categorical variables to one-hots
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer([
    ("", onehotencoder, [2, 3]),
], remainder="passthrough").fit_transform(X)

Y_onehot = onehotencoder.fit_transform(y).toarray()

# Make sure it's all integers, no float - to save memory/computational time?
X.astype(int)
y.astype(int)

# Split and scale the data
seed = 0
Xtrain, Xtest, ytrain, ytest, Y_train_onehot, Y_test_onehot = train_test_split(
    X, y, Y_onehot, test_size=0.2, random_state=seed)
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
'''

# PCA
# Aim to keep 97% of the variance and let the algorithm find the appropriate
# number of principal components.
pca = PCA(.97)
# Fit on the training data only, then overwrite Xtrain with its projection.
pca.fit(Xtrain)
Xtrain = pca.transform(Xtrain)
# Column layout of X before encoding (indices in parentheses):
#   (0)     amount of credit given
#   (1-4)   social info: gender, education, marital status, age
#   (5-10)  history of past payments: how many months the client has delayed
#   (11-16) amount owed
#   (17-22) amount of previous payments
# One-hot encode columns 1-3; all remaining columns are passed through.
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer(
    [
        ("", onehotencoder, [1, 2, 3]),
    ],
    remainder="passthrough",
).fit_transform(X)

# Normalize X: divide every entry by the single largest value in the matrix.
X = X.astype(float) / np.max(X)

#print(X.shape)


def accuracy(p, y):
    """Return the fraction of rounded predictions that match the targets.

    Args:
        p: Predicted values; they are rounded to the nearest integer and
            flattened before the comparison.
        y: Target array; flattened with ``ravel()`` before the comparison.

    Returns:
        The number of positions where the rounded prediction equals the
        target, divided by ``len(p)``.
    """
    # One rounding pass is enough: rounding already-rounded values is a no-op.
    predicted = np.round(p).ravel()
    return np.sum(predicted == y.ravel()) / len(p)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

##########################
## My own class method ##
##########################
mySolver = ph.LogisticRegression()