import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

from global_functions import get_balanced_data


def get_forest_model(data=data, balanced=False, model_name='model_forest_unbalanced_ori.pkl'):
    if balanced:
        X_train, X_test, y_train, y_test = get_balanced_data(data)
    else:
        X = data.drop('class', axis=1)
        y = data['class']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=1)

    # Number of trees in the random forest
    n_estimators = [10, 100]  # alternative: np.linspace(200, 2000, 10)
    # Maximum number of levels in each tree
    max_depth = [3, 5, 10]  # alternative: np.linspace(10, 110, 11)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 10, 20]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # (max_features, min_samples_leaf and criterion were also tried but left
    # out of the final grid)

    param_grid = {'n_estimators': n_estimators,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'bootstrap': bootstrap}

    # Create the base model to tune, then exhaustively search the grid
    # (2 * 3 * 3 * 2 = 36 combinations) with 3-fold cross-validation on 7 cores.
    # Note: a regressor is used here; its continuous outputs are rounded to
    # 0/1 class labels downstream.
    rf = RandomForestRegressor()
    rf_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                             cv=3, verbose=2, n_jobs=7)
    rf_search.fit(X_train, y_train)
    print(rf_search.best_params_)

    # best_estimator_ is already refit on the full training set
    model = rf_search.best_estimator_

    path = '1) classification algorithms/random forest/credit card fraud/' + model_name
    with open(path, 'wb') as file:
        pickle.dump(model, file)
    return
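# --- Hedged usage sketch (added for illustration; not part of the original
# script). Assumes the module-level `data` DataFrame and the directory layout
# above. It trains and pickles the unbalanced forest, reloads it, and rounds
# the regressor's continuous outputs to 0/1 class labels, mirroring the
# evaluation script below; predicting on the full feature set is purely for
# demonstration.
if __name__ == '__main__':
    get_forest_model(data=data, balanced=False,
                     model_name='model_forest_unbalanced_ori.pkl')
    model_path = ('1) classification algorithms/random forest/credit card fraud/'
                  'model_forest_unbalanced_ori.pkl')
    with open(model_path, 'rb') as file:
        forest = pickle.load(file)
    predicted_labels = [int(round(p))
                        for p in forest.predict(data.drop('class', axis=1))]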
import pickle

from sklearn import svm

from global_functions import get_balanced_data


def get_SVM_model(data=data, kernel='rbf', gamma=0.0001, C=1000,
                  probability=True, model_name='model_SVM_balanced.pkl'):
    X_train, X_test, y_train, y_test = get_balanced_data(data)

    # Create the SVM classifier object
    clf = svm.SVC(kernel=kernel, gamma=gamma, C=C, probability=probability)

    # Train the SVM classifier and pickle the fitted model
    model = clf.fit(X_train, y_train)
    path = '1) classification algorithms/SVM/credit card fraud/' + model_name
    with open(path, 'wb') as file:
        pickle.dump(model, file)
    return
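# --- Hedged usage sketch (illustration only). Because the SVC is built with
# probability=True, the pickled model exposes predict_proba; the 0.5 decision
# threshold used here is an assumption, not taken from the original scripts.
if __name__ == '__main__':
    get_SVM_model(data=data, model_name='model_SVM_balanced.pkl')
    svm_path = '1) classification algorithms/SVM/credit card fraud/model_SVM_balanced.pkl'
    with open(svm_path, 'rb') as file:
        svm_model = pickle.load(file)
    X_train, X_test, y_train, y_test = get_balanced_data(data)
    fraud_probability = svm_model.predict_proba(X_test)[:, 1]
    predicted = (fraud_probability >= 0.5).astype(int)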
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from global_functions import plot_confusion_matrix, cm_analysis, get_balanced_data

np.random.seed(7)

# load the data (working directory is the MSc Project root)
file_name = 'data/credit card fraud/data_creditcard.pkl'
data = pd.read_pickle(file_name)

# unbalanced data
X = data.drop('class', axis=1)
y = data['class']
X_train_unbalanced, X_test_unbalanced, y_train_unbalanced, y_test_unbalanced = train_test_split(
    X, y, test_size=0.25, random_state=1)

# balanced data: even out the data set -> 1:1 ratio of fraud to non-fraud
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = get_balanced_data(
    data)

# unpack the unbalanced model
path = '1) classification algorithms/random forest/credit card fraud/model_forest_unbalanced_ori.pkl'
with open(path, 'rb') as file:
    unbalanced_model = pickle.load(file)

# unpack the balanced model
path = '1) classification algorithms/random forest/credit card fraud/model_forest_balanced_ori.pkl'
with open(path, 'rb') as file:
    balanced_model = pickle.load(file)

# predict labels; the unbalanced forest is a regressor, so its continuous
# outputs are rounded to 0/1 class labels
unbalanced_predictions = unbalanced_model.predict(X_test_unbalanced)
unbalanced_predictions = [int(round(x)) for x in unbalanced_predictions]
balanced_predictions = balanced_model.predict(X_test_balanced)
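# --- Hedged evaluation sketch (illustration only). The repository's own
# plot_confusion_matrix / cm_analysis helpers are imported above, but their
# signatures are not shown here, so this falls back to scikit-learn's
# confusion_matrix and classification_report. Rounding the balanced model's
# predictions mirrors the unbalanced path and is an assumption.
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test_unbalanced, unbalanced_predictions))
print(classification_report(y_test_unbalanced, unbalanced_predictions))
balanced_labels = [int(round(x)) for x in balanced_predictions]
print(confusion_matrix(y_test_balanced, balanced_labels))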
import pickle

import numpy
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from global_functions import get_balanced_data

numpy.random.seed(7)

# load the data (working directory is the MSc Project root)
file_name = 'data/customer churn/customer churn modified.pkl'
data = pd.read_pickle(file_name)

X_train, X_test, y_train, y_test = get_balanced_data(data)


def get_NN_model(data=data, lr=0.001, loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'], validation_split=0.2, batch_size=25,
                 epochs=20, shuffle=True, verbose=2,
                 model_name='model_NN_balanced_churn.pkl'):
    # create the neural net
    n_inputs = X_train.shape[1]
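    # --- Hedged completion (the original function body is truncated at this
    # point). The architecture below is an assumption modelled on the
    # baseline_model used in get_accuracies further down: two ReLU hidden
    # layers and a 2-unit softmax output, compiled with Adam at the given
    # learning rate. The pickle path mirrors the other models' directory
    # layout and is likewise assumed; model.save() would be the more standard
    # Keras route than pickling.
    model = Sequential()
    model.add(Dense(100, input_dim=n_inputs, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss=loss, optimizer=Adam(lr=lr), metrics=metrics)
    model.fit(X_train, y_train, validation_split=validation_split,
              batch_size=batch_size, epochs=epochs, shuffle=shuffle,
              verbose=verbose)
    path = '1) classification algorithms/NN/customer churn/' + model_name
    with open(path, 'wb') as file:
        pickle.dump(model, file)
    return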
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC

from global_functions import get_balanced_data


def get_accuracies(data):
    X_train, X_test, y_train, y_test = get_balanced_data(data)
    seed = 1

    rfc = RandomForestClassifier(bootstrap=True, max_depth=10, max_features='auto',
                                 min_samples_leaf=2, min_samples_split=10,
                                 n_estimators=500)
    rfc2 = RandomForestClassifier(bootstrap=False, max_depth=2, max_features='auto',
                                  min_samples_leaf=5, min_samples_split=20,
                                  n_estimators=100)
    gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25,
                                     loss='deviance', learning_rate=0.1,
                                     max_depth=5, max_features='auto',
                                     criterion='friedman_mse', n_estimators=100)

    def baseline_model(optimizer='adam', learn_rate=0.01):
        model = Sequential()
        model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
        # 50 hidden units (the units are the kernel dimension)
        model.add(Dense(50, activation='relu'))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100,
                            verbose=0, optimizer='Adam')
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    svm = SVC(gamma='scale', probability=True, kernel='rbf', C=0.5)

    models = [('GBM', gbm), ('RFC', rfc), ('RFC2', rfc2),
              ('Keras', keras), ('SVM', svm)]
    results = []
    names = []
    scoring = 'accuracy'
    accuracy = []
    for name, model in models:
        # cross-validated accuracy on the training folds
        cv_results = cross_val_score(model, X_train, y_train,
                                     cv=outer_cv, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        # msg = "Cross-validation Accuracy %s: %f (+/- %f )" % (
        #     name, cv_results.mean() * 100, cv_results.std() * 100)
        # print(msg)
        # refit on the full training set and score on the held-out test set
        model.fit(X_train, y_train)
        # print('Test set accuracy: {:.2f}'.format(model.score(X_test, y_test) * 100), '%')
        accuracy.append(model.score(X_test, y_test))
    return accuracy
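# --- Hedged usage sketch (illustration only). Assumes a `data` DataFrame
# such as the churn set loaded above; pairs each model name with its
# held-out test set accuracy, in the order the models are defined.
if __name__ == '__main__':
    accuracies = get_accuracies(data)
    for name, acc in zip(['GBM', 'RFC', 'RFC2', 'Keras', 'SVM'], accuracies):
        print('{} test set accuracy: {:.2f} %'.format(name, acc * 100))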