def run_logreg():
    """Cross-validate the module-level ``mlr`` estimator, then score ``data_test``.

    Each element of ``cv_splits`` is divided with ``train_test_split``;
    ``mlr`` is refit on the training part and its coefficients, recall,
    precision, accuracy and confusion matrix for the held-out part are
    printed.  The mean recall, precision and accuracy over all splits are
    printed next, followed by the same report for ``data_test``.

    Returns:
        None. Everything is reported on stdout.
    """

    def report_scores(y_true, y_pred):
        # Print recall, precision, accuracy and the confusion matrix (in that
        # order) and hand the three scalar scores back to the caller.
        recall = r_s(y_true, y_pred)
        print('Recall Score : \n', recall)
        precision = p_s(y_true, y_pred)
        print('Precision Score : \n', precision)
        accuracy = a_s(y_true, y_pred)
        print('Accuracy Score : \n', accuracy)
        print('Confusion Matrix : \n', c_m(y_true, y_pred))
        return recall, precision, accuracy

    # CV
    print()
    recs, precs, accs = [], [], []
    for epoch, split in enumerate(cv_splits, start=1):
        print('CV Epoch : ' + str(epoch))
        cv_train, cv_test = train_test_split(split)
        cv_train_X, cv_train_Y = get_X_Y(cv_train)
        cv_test_X, cv_test_Y = get_X_Y(cv_test)
        mlr.fit(cv_train_X, cv_train_Y)
        cv_pred = mlr.predict(cv_test_X)
        print('Coefficients : \n', mlr.coef_)
        recall, precision, accuracy = report_scores(cv_test_Y, cv_pred)
        recs.append(recall)
        precs.append(precision)
        accs.append(accuracy)
        print()
    print('Average Recall Score : %f' % np.mean(recs))
    print('Average Precision Score : %f' % np.mean(precs))
    print('Average Accuracy Score : %f' % np.mean(accs))
    print()

    # Test
    # No refit happens here: ``mlr`` is used exactly as the last CV
    # iteration left it.
    test_X, test_Y = get_X_Y(data_test)
    test_pred = mlr.predict(test_X)
    print('Test Coefficients : \n', mlr.coef_)
    report_scores(test_Y, test_pred)
    print()
    return None
def sgs():
    """Grid-search an RBF-kernel ``SVC`` and report its scores on ``data_test``.

    ``C`` and ``gamma`` are tuned with a 5-fold ``GridSearchCV`` on the data
    returned by ``get_X_Y(data_cv)``.  The fitted search object then predicts
    ``get_X_Y(data_test)`` and the best estimator, recall, precision,
    accuracy, confusion matrix and elapsed wall-clock seconds are printed.

    Returns:
        None. Everything is reported on stdout.
    """
    # gaussian/rbf
    print()
    tg = time.time()

    cs = [0.1, 0.5, 1.0, 2.0, 5.0]
    sigmas = [0.1, 0.5, 1.0, 2.0, 4.0]
    hyperparams = {'C': cs, 'gamma': sigmas}
    # C and gamma come from the grid only; the estimator itself expects
    # scalars, so the candidate lists must not be passed to the constructor.
    rbf_svc = SVC(kernel='rbf', cache_size=4096)
    rbf_svc_clf = GridSearchCV(rbf_svc, hyperparams, cv=5)

    cv_train_X, cv_train_Y = get_X_Y(data_cv)
    rbf_svc_clf.fit(cv_train_X, cv_train_Y)

    test_X, test_Y = get_X_Y(data_test)
    test_pred = rbf_svc_clf.predict(test_X)

    # The Best Estimator
    print('Test Estimator : \n', rbf_svc_clf.best_estimator_)
    # Recall Score
    recall = r_s(test_Y, test_pred)
    print('Recall Score : \n', recall)
    # Precision Score
    precision = p_s(test_Y, test_pred)
    print('Precision Score : \n', precision)
    # Accuracy Score
    accuracy = a_s(test_Y, test_pred)
    print('Accuracy Score : \n', accuracy)
    # Confusion Matrix
    print('Confusion Matrix : \n', c_m(test_Y, test_pred))

    tg = time.time() - tg
    print('Time Secs : %f' % tg)
    return None
def sls():
    """Grid-search a linear-kernel ``SVC`` and report its scores on ``data_test``.

    ``C`` is tuned with a 5-fold ``GridSearchCV`` on the data returned by
    ``get_X_Y(data_cv)``.  The fitted search object then predicts
    ``get_X_Y(data_test)`` and the best estimator, recall, precision,
    accuracy, confusion matrix and elapsed wall-clock seconds are printed.

    Returns:
        None. Everything is reported on stdout.
    """
    # linear
    print()
    tl = time.time()

    cs = [0.1, 0.5, 1.0, 2.0, 5.0]
    hyperparams = {'C': cs}
    # C comes from the grid only; the estimator itself expects a scalar, so
    # the candidate list must not be passed to the constructor.
    lin_svc = SVC(kernel='linear', cache_size=4096)
    lin_svc_clf = GridSearchCV(lin_svc, hyperparams, cv=5)

    cv_train_X, cv_train_Y = get_X_Y(data_cv)
    lin_svc_clf.fit(cv_train_X, cv_train_Y)

    test_X, test_Y = get_X_Y(data_test)
    test_pred = lin_svc_clf.predict(test_X)

    # The Best Estimator
    print('Test Estimator : \n', lin_svc_clf.best_estimator_)
    # Recall Score
    recall = r_s(test_Y, test_pred)
    print('Recall Score : \n', recall)
    # Precision Score
    precision = p_s(test_Y, test_pred)
    print('Precision Score : \n', precision)
    # Accuracy Score
    accuracy = a_s(test_Y, test_pred)
    print('Accuracy Score : \n', accuracy)
    # Confusion Matrix
    print('Confusion Matrix : \n', c_m(test_Y, test_pred))

    tl = time.time() - tl
    print('Time Secs : %f' % tl)
    return None
def kp():
    """Grid-search polynomial kernel ridge regression on every data batch.

    For each entry of ``data_batches`` the batch is split with
    ``train_test_split``, a 5-fold ``GridSearchCV`` over ``alpha`` and
    ``degree`` is fitted on the training part, and the held-out part is
    predicted.  The best estimator, recall, precision, accuracy and
    confusion matrix are printed per batch, followed by the mean scores over
    all batches and the elapsed wall-clock seconds.

    Returns:
        None. Everything is reported on stdout.
    """
    # polynomial
    print()
    tp = time.time()
    recs, precs, accs = [], [], []

    alphas = [1.0]
    degs = [2.0, 3.0]  # M
    hyperparams = {'alpha': alphas, 'degree': degs}
    # alpha and degree come from the grid only; the candidate lists must not
    # be passed to the constructor as if they were single settings.
    poly_krr = KernelRidge(kernel='poly', gamma=1, coef0=1)
    poly_krr_clf = GridSearchCV(poly_krr, hyperparams, cv=5)

    for batch in data_batches:
        batch_train, batch_test = train_test_split(batch)
        cv_train_X, cv_train_Y = get_X_Y(batch_train)
        poly_krr_clf.fit(cv_train_X, cv_train_Y)

        test_X, test_Y = get_X_Y(batch_test)
        # The regressor returns continuous values; round them to labels so
        # the classification metrics below accept them.
        # NOTE(review): assuming r_s/p_s are sklearn's recall/precision
        # scores, they need an explicit ``average=`` if rounding yields more
        # than two distinct labels - confirm against the data.
        test_pred = np.round(poly_krr_clf.predict(test_X))

        # The Best Estimator
        print('Test Estimator : \n', poly_krr_clf.best_estimator_)
        # Recall Score
        recall = r_s(test_Y, test_pred)
        print('Recall Score : \n', recall)
        # Precision Score
        precision = p_s(test_Y, test_pred)
        print('Precision Score : \n', precision)
        # Accuracy Score
        accuracy = a_s(test_Y, test_pred)
        print('Accuracy Score : \n', accuracy)
        # Confusion Matrix
        print('Confusion Matrix : \n', c_m(test_Y, test_pred))

        recs.append(recall)
        precs.append(precision)
        accs.append(accuracy)
        print()

    print('Average Test Recall Score : %f' % np.mean(recs))
    print('Average Test Precision Score : %f' % np.mean(precs))
    print('Average Test Accuracy Score : %f' % np.mean(accs))
    tp = time.time() - tp
    print('Time Secs : %f' % tp)
    return None
def kg():
    """Grid-search RBF kernel ridge regression on every data batch.

    For each entry of ``data_batches`` the batch is split with
    ``train_test_split``, a 5-fold ``GridSearchCV`` over ``alpha`` and
    ``gamma`` is fitted on the training part, and the held-out part is
    predicted.  The best estimator, recall, precision, accuracy and
    confusion matrix are printed per batch, followed by the mean scores over
    all batches and the elapsed wall-clock seconds.

    Returns:
        None. Everything is reported on stdout.
    """
    # gaussian/rbf
    print()
    tg = time.time()
    recs, precs, accs = [], [], []

    alphas = [1.0]
    sigmas = [0.1, 0.5, 1.0, 2.0, 4.0]
    hyperparams = {'alpha': alphas, 'gamma': sigmas}
    # alpha and gamma come from the grid only; the candidate lists must not
    # be passed to the constructor as if they were single settings.
    rbf_krr = KernelRidge(kernel='rbf')
    rbf_krr_clf = GridSearchCV(rbf_krr, hyperparams, cv=5)

    for batch in data_batches:
        batch_train, batch_test = train_test_split(batch)
        cv_train_X, cv_train_Y = get_X_Y(batch_train)
        rbf_krr_clf.fit(cv_train_X, cv_train_Y)

        test_X, test_Y = get_X_Y(batch_test)
        # The regressor returns continuous values; round them to labels so
        # the classification metrics below accept them.
        # NOTE(review): assuming r_s/p_s are sklearn's recall/precision
        # scores, they need an explicit ``average=`` if rounding yields more
        # than two distinct labels - confirm against the data.
        test_pred = np.round(rbf_krr_clf.predict(test_X))

        # The Best Estimator
        print('Test Estimator : \n', rbf_krr_clf.best_estimator_)
        # Recall Score
        recall = r_s(test_Y, test_pred)
        print('Recall Score : \n', recall)
        # Precision Score
        precision = p_s(test_Y, test_pred)
        print('Precision Score : \n', precision)
        # Accuracy Score
        accuracy = a_s(test_Y, test_pred)
        print('Accuracy Score : \n', accuracy)
        # Confusion Matrix
        print('Confusion Matrix : \n', c_m(test_Y, test_pred))

        recs.append(recall)
        precs.append(precision)
        accs.append(accuracy)
        print()

    print('Average Test Recall Score : %f' % np.mean(recs))
    print('Average Test Precision Score : %f' % np.mean(precs))
    print('Average Test Accuracy Score : %f' % np.mean(accs))
    tg = time.time() - tg
    print('Time Secs : %f' % tg)
    return None
def kl():
    """Grid-search linear kernel ridge regression on every data batch.

    For each entry of ``data_batches`` the batch is split with
    ``train_test_split``, a 5-fold ``GridSearchCV`` over ``alpha`` is fitted
    on the training part, and the held-out part is predicted.  The best
    estimator, recall, precision, accuracy and confusion matrix are printed
    per batch, followed by the mean scores over all batches and the elapsed
    wall-clock seconds.

    Returns:
        None. Everything is reported on stdout.
    """
    # linear
    print()
    tl = time.time()
    recs, precs, accs = [], [], []

    alphas = [1.0]
    hyperparams = {'alpha': alphas}
    # alpha comes from the grid only, so it is not repeated in the
    # constructor call.
    lin_krr = KernelRidge(kernel='linear')
    lin_krr_clf = GridSearchCV(lin_krr, hyperparams, cv=5)

    for batch in data_batches:
        batch_train, batch_test = train_test_split(batch)
        cv_train_X, cv_train_Y = get_X_Y(batch_train)
        lin_krr_clf.fit(cv_train_X, cv_train_Y)

        test_X, test_Y = get_X_Y(batch_test)
        # The regressor returns continuous values; round them to labels so
        # the classification metrics below accept them.
        # NOTE(review): assuming r_s/p_s are sklearn's recall/precision
        # scores, they need an explicit ``average=`` if rounding yields more
        # than two distinct labels - confirm against the data.
        test_pred = np.round(lin_krr_clf.predict(test_X))

        # The Best Estimator
        print('Test Estimator : \n', lin_krr_clf.best_estimator_)
        # Recall Score
        recall = r_s(test_Y, test_pred)
        print('Recall Score : \n', recall)
        # Precision Score
        precision = p_s(test_Y, test_pred)
        print('Precision Score : \n', precision)
        # Accuracy Score
        accuracy = a_s(test_Y, test_pred)
        print('Accuracy Score : \n', accuracy)
        # Confusion Matrix
        print('Confusion Matrix : \n', c_m(test_Y, test_pred))

        recs.append(recall)
        precs.append(precision)
        accs.append(accuracy)
        print()

    print('Average Test Recall Score : %f' % np.mean(recs))
    print('Average Test Precision Score : %f' % np.mean(precs))
    print('Average Test Accuracy Score : %f' % np.mean(accs))
    tl = time.time() - tl
    print('Time Secs : %f' % tl)
    return None
def sps():
    """Grid-search a polynomial-kernel ``SVC`` and report its scores on ``data_test``.

    ``C`` and ``degree`` are tuned with a 5-fold ``GridSearchCV`` on the data
    returned by ``get_X_Y(data_cv)``.  The fitted search object then predicts
    ``get_X_Y(data_test)`` and the best estimator, recall, precision,
    accuracy, confusion matrix and elapsed wall-clock seconds are printed.

    Returns:
        None. Everything is reported on stdout.
    """
    # polynomial
    print()
    tp = time.time()

    cs = [0.1, 0.5, 1.0, 2.0, 5.0]
    # SVC documents ``degree`` as an int, so the candidates are integers
    # rather than floats.
    degs = [2, 3]  # M
    hyperparams = {'C': cs, 'degree': degs}
    # C and degree come from the grid only; the estimator itself expects
    # scalars, so the candidate lists must not be passed to the constructor.
    poly_svc = SVC(kernel='poly', gamma=1, coef0=1, cache_size=4096)
    poly_svc_clf = GridSearchCV(poly_svc, hyperparams, cv=5)

    cv_train_X, cv_train_Y = get_X_Y(data_cv)
    poly_svc_clf.fit(cv_train_X, cv_train_Y)

    test_X, test_Y = get_X_Y(data_test)
    test_pred = poly_svc_clf.predict(test_X)

    # The Best Estimator
    print('Test Estimator : \n', poly_svc_clf.best_estimator_)
    # Recall Score
    recall = r_s(test_Y, test_pred)
    print('Recall Score : \n', recall)
    # Precision Score
    precision = p_s(test_Y, test_pred)
    print('Precision Score : \n', precision)
    # Accuracy Score
    accuracy = a_s(test_Y, test_pred)
    print('Accuracy Score : \n', accuracy)
    # Confusion Matrix
    print('Confusion Matrix : \n', c_m(test_Y, test_pred))

    tp = time.time() - tp
    print('Time Secs : %f' % tp)
    return None
scoring=scoring) results.append(cv_results) names.append(name) # the data output is: name, mean & std model_res = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(model_res) # LDA performed best print('\n') print('running LDA model i.e. best performing model on validation data....') print('\n') lda = LDA() # first fit the lda instance against the entire training dataset lda.fit(X_train, Y_train) # use the trained model to make predictions againts the validation # feature matrix X_validation predictions = lda.predict(X_validation) # determine the accuracy of the model by scoring against the validation # results vector Y_validation print('accuracy_score:', a_s(Y_validation, predictions)) print('\n') print('confusion_matrix') print('\n') # generate confusion matrix print(c_m(Y_validation, predictions)) print('\n') print('classification_report') print('\n') # generate classification report print(c_r(Y_validation, predictions))
from sklearn.model_selection import train_test_split as t_t_s
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GM
from sklearn.metrics import accuracy_score as a_s

# Load the iris frame and separate the feature columns from the label column.
df = sns.load_dataset('iris')
x = df.drop(columns='species')
y = df['species']

# Baseline: Gaussian naive Bayes on the original features (25% held out).
xtr, xte, ytr, yte = t_t_s(x, y, test_size=0.25, random_state=0)
print(xtr.shape, yte.shape)  # (112, 4) (38,)
model = GNB()
model.fit(xtr, ytr)
ypred = model.predict(xte)
print("分类准确率:{0:.2%}".format(a_s(yte, ypred)))

# Dimensionality reduction: project the features onto two principal
# components, then repeat the same split and classifier on the projection.
pca = PCA(n_components=2)
new_x = pca.fit_transform(x)
xtr_new, xte_new, ytr_new, yte_new = t_t_s(
    new_x, y, test_size=0.25, random_state=0
)
print(xtr_new.shape, yte_new.shape)  # (112, 2) (38,)
model1 = GNB()
model1.fit(xtr_new, ytr_new)
ypred1 = model1.predict(xte_new)
print("PCA后分类准确率:{0:.2%}".format(a_s(yte_new, ypred1)))

# Keep both components on the frame as extra columns.
df['PCA1'] = new_x[:, 0]
df['PCA2'] = new_x[:, 1]
acctest = 0.8 N = 50000 good = 0 treeid = 0 for i in range(N): tryclf = rfc(criterion='entropy', max_leaf_nodes=10, min_samples_split=4) #(criterion='entropy', #max_leaf_nodes=10, #min_samples_split=4) tprep = t() tryclf.fit(features_train, labels_train) tfit = t() pred = tryclf.predict(features_test) tpred = t() acc = a_s(labels_test, pred) tacc = t() if acc>acctest: print(f"The accuracy of the random forest {i} is {acc*100:.2f}%") #good = input("is this good enough (1=yes/else no)\n\n>>> ") if good == 1: acctest = acc clf = tryclf treeid = i print(f"Tree finished at accuracy {acctest*100:.2f}%") else: treeid = i acctest = acc clf = tryclf if good == 1:
# Summarise the cross-validation scores as: name, mean & std.
model_res = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(model_res)

# LR performed best
print('\n')
print('running LR model i.e. best performing model on validation data....')
print('\n')

# Fit a fresh LR instance on the entire training set, then predict the
# validation feature matrix X_validation.
lr = LR()
lr.fit(X_train, Y_train)
predictions = lr.predict(X_validation)

# Score the predictions against the validation results vector Y_validation.
print('accuracy_score:', a_s(Y_validation, predictions))
print('\n')

# Confusion matrix
print('confusion_matrix')
print('\n')
print(c_m(Y_validation, predictions))
print('\n')

# Classification report
print('classification_report')
print('\n')
print(c_r(Y_validation, predictions))

# Load the test dataset.
testfile = './woe-test.csv'
dataset = pandas.read_csv(testfile, header=0)
# Train the model on the training split.
print('\n training d model...')
model.fit(X_train, Y_train)

# Show the trained model.
print('\n model...')
print(model)

# Predict the test split and round every prediction to the nearest integer.
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate the rounded predictions.
accuracy = a_s(Y_test, predictions)
print("WoE Init Accuracy: %.2f%%" % (accuracy * 100.0))

# Load the validation dataset, discard the columns that are not needed and
# keep the 'status' column as a one-column frame.
validationfile = './woe-test.csv'
dataset = pd.read_csv(validationfile, header=0)
dataset = dataset.drop(columns=['fil', 'status_log'])
Y_ext = dataset.filter(items=['status'], axis=1)

print('\n Y_ext.head(5)')
print(Y_ext.head(5))
print('\n Y_ext.values')
print(Y_ext.values)
# Time the prediction step of the linear KRR search object.
tl = time.time()
test_pred = lin_krr_clf.predict(test_X)
tl = time.time() - tl
print('Time Taken To Test : %f Secs.' % tl)

# Round Test Predictions to Avoid Multiclass Continuous Targets Error
test_pred = np.round(test_pred)

# The Best Estimator
print('Test Estimator : \n', lin_krr_clf.best_estimator_)
# Recall Score
recall = r_s(test_Y, test_pred, average='micro')
print('Recall Score : \n', recall)
# Precision Score
precision = p_s(test_Y, test_pred, average='micro')
print('Precision Score : \n', precision)
# Accuracy Score
accuracy = a_s(test_Y, test_pred)
print('Accuracy Score : \n', accuracy)
# Confusion Matrix
print('Confusion Matrix : \n', c_m(test_Y, test_pred))
print()

# Polynomial Kernel Ridge Regression
print()

# Functions and Parameters
alphas = [1.0]
degs = [2.0, 3.0, 4.0]  # M
hyperparams = {'alpha': alphas, 'degree': degs}
# alpha and degree come from the grid only; the candidate lists must not be
# passed to the constructor as if they were single settings.
poly_krr = KernelRidge(kernel='poly', gamma=1, coef0=1)

# Polynomial KRR Initializer
poly_krr_clf = GridSearchCV(poly_krr, hyperparams, cv=5)
# The hold-out part has as many rows as the no-missing test dataset.
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)

# Unshuffled split, so the last ``test_set_size`` rows form the test part.
X_train, X_test, Y_train, Y_test = t_t_s(
    X,
    Y,
    test_size=test_set_size,
    random_state=seed,
    shuffle=False,
)

# Instantiate the XGBC class with its defaults and fit it on the training part.
model = XGBC()
print('\n training d model...')
model.fit(X_train, Y_train)

# Show the trained model.
print('\n model...')
print(model)

# Predict the test part and round every prediction to the nearest integer.
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
train_predictions = model.score(X_train, Y_train)

# Accuracy on the test results vector Y_test, then the training score.
print('\n xgb_test_raw_accuracy_score:', a_s(Y_test, predictions))
print('\n xgb_train_raw_accuracy_score:', train_predictions)
test_size=test_set_size, random_state=seed, shuffle=False) # instantiate XGBC class using defaults model = XGBC() # fit model to training datasets print('\n training d model...') model.fit(X_train, Y_train) # view trained model print('\n model...') print(model) # make predictions for test data print('\n making predictions...') y_pred = model.predict(X_test) predictions = [round(value) for value in y_pred] train_predictions = model.score(X_train, Y_train) # determine the accuracy of the model by scoring against the test # results vector Y_test print('\n xgb_test_standardized_accuracy_score:', a_s(Y_test, predictions)) print('\n xgb_train_standardized_accuracy_score:', train_predictions) # determine the classification report of the model print('\n xg_classification report:') print('\n') print(c_r(Y_test, predictions, digits=3))
# Unshuffled split, so the last ``test_set_size`` rows form the test part.
X_train, X_test, Y_train, Y_test = t_t_s(
    X,
    Y,
    test_size=test_set_size,
    random_state=seed,
    shuffle=False,
)

# Instantiate the XGBC class with its defaults and fit it on the training part.
model = XGBC()
print('\ntraining d model...')
model.fit(X_train, Y_train)

# Show the trained model.
print('\nmodel...')
print(model)

# Predict the test part and round every prediction to the nearest integer.
print('\nmaking predictions...')
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Evaluate the rounded predictions.
accuracy = a_s(Y_test, predictions)
print("Base Accuracy: %.2f%%" % (accuracy * 100.0))

# Attempt to improve the performance by using an XGBoost-specific DMatrix.
xgbdmat = DMat(X_train, Y_train)

# Cheat a bit by using some parameters we can see from the trained model.
our_params = {
    'eta': 0.1,
    'seed': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'max_depth': 3,
    'min_child_weight': 1,
}
# Cross-validate on the DMatrix built above. The original passed ``xgdmat``,
# a name that is never assigned in this section.
cv_xgb = XGBCV(
    params=our_params,
    dtrain=xgbdmat,
    num_boost_round=3000,
    nfold=5,
    metrics=['error'],
    early_stopping_rounds=100,
)
# The hold-out part has as many rows as the no-missing test dataset.
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)

# Unshuffled split of the rescaled features, so the last ``test_set_size``
# rows form the test part.
X_train, X_test, Y_train, Y_test = t_t_s(
    rescaledX,
    Y,
    test_size=test_set_size,
    random_state=seed,
    shuffle=False,
)

# Instantiate the XGBC class with its defaults and fit it on the training part.
model = XGBC()
print('\n training d model...')
model.fit(X_train, Y_train)

# Show the trained model.
print('\n model...')
print(model)

# Predict the test part and round every prediction to the nearest integer.
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
train_predictions = model.score(X_train, Y_train)

# Accuracy on the test results vector Y_test, then the training score.
print('\n xgb_test_normalized_accuracy_score:', a_s(Y_test, predictions))
print('\n xgb_train_normalized_accuracy_score:', train_predictions)
return self.y_train[best_index] ## Reading the dataframe iris = pd.read_csv("iris.csv") ## creating feature and target variable for the model X = iris.drop('species', axis=1).values y = iris['species'].values ## spliting the dataset into Training and Testing Dataset X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) ## making a instance of the KNN class: knn knn = KNN() ##fitting the dataset into knn classifier knn.fit(X_train, y_train) ## predicting the species for the test dataset prediction = knn.predict(X_test) ## calculating the accuracy for the given model print(a_s(prediction, y_test)) # In[ ]: