def main(): #getting and splitting data print("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix, cancer_onehot, test_size=0.20, random_state=0) #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print(scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=3), max_features=20000, max_samples=1.0, n_estimators=11, random_state=2) bagging.fit(scaled_train_X, train_Y) pred_Y = bagging.predict(scaled_test_X) f1_score_bagging = f1_score(test_Y, pred_Y, average=None) print("f1 score is: " + str(f1_score_bagging) ) #max_features=20000 -> f1 score is: [0.77852349 0.47619048] mean_train_accuracy = bagging.score(scaled_train_X, train_Y) print("mean_train_accuracy is: " + str(mean_train_accuracy)) mean_test_accuracy = bagging.score(scaled_test_X, test_Y) print("mean_test_accuracy is: " + str(mean_test_accuracy)) print(bagging)
def main(): #getting and splitting data print ("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,cancer_onehot, test_size=0.20, random_state=0) #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print (scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) rand_forest = RandomForestClassifier(bootstrap=True, criterion='gini', n_estimators=80, max_depth=3, max_features='auto', oob_score=True, random_state=0) rand_forest.fit(scaled_train_X, train_Y) pred_Y = rand_forest.predict(scaled_test_X) rf_f1_score = f1_score(test_Y, pred_Y, average=None) print ("rf_f1_score is: " + str(rf_f1_score)) print ("oob score is: " + str(rand_forest.oob_score_)) rf_mean_train_accuracy = rand_forest.score(scaled_train_X, train_Y) print ("mean rf train accuracy is: " + str(rf_mean_train_accuracy)) rf_mean_test_accuracy = rand_forest.score(scaled_test_X, test_Y) print ("mean rf test accuracy is: " + str(rf_mean_test_accuracy)) print (rand_forest) print (len(rand_forest.feature_importances_)) feat_importance = rand_forest.feature_importances_ considered_feats = 0 total_feats = 0 total_weight = 0 max_weight = 0 for feat in feat_importance: if feat > 0: considered_feats += 1 total_weight += feat max_weight = max(max_weight, feat) total_feats += 1 print ("considered_feats is: " + str(considered_feats)) print ("total_weight is: " + str(total_weight)) print ("max weight is: " + str(max_weight))
def main(): #getting and splitting data print ("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,cancer_onehot, test_size=0.20, random_state=0) # total_train_ex = 0 # total_train_pos = 0 # for lbl in train_Y: # total_train_ex += 1 # if lbl == 1: # total_train_pos += 1 # print ("total_train_ex: " + str(total_train_ex) + " total_train_pos: " + str(total_train_pos)) # total_test_ex = 0 # total_test_pos = 0 # for lbl in test_Y: # total_test_ex += 1 # if lbl == 1: # total_test_pos += 1 # print ("total_test_ex: " + str(total_test_ex) + " total_test_pos: " + str(total_test_pos)) best_f1_score = f1_score(test_Y, test_Y, average=None) print ("best_f1_score: " + str(best_f1_score)) print ("LR fitting") start = timer() max_allowed_iters = 1500 # lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=max_allowed_iters) # lr = LogisticRegression(solver='lbfgs', max_iter=400) lr = LogisticRegression(solver='lbfgs', max_iter=max_allowed_iters) lr.fit(train_X, train_Y) # print ("LR predicting") pred_Y = lr.predict(test_X) end = timer() print ("logistic regression took: " + str(end - start) + " seconds") num_iters = lr.n_iter_ print ("took " + str(num_iters[0]) + " iterations") print ("test accuracy is: " + str(lr.score(test_X, test_Y))) print ("train accuracy is: " + str(lr.score(train_X, train_Y))) score_f1 = f1_score(test_Y, pred_Y, average=None) print ("f1_score is: " + str(score_f1))
def main(): #getting and splitting data print("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix, cancer_onehot, test_size=0.20, random_state=0) #make y sets +/- 1 # print ("train_Y is: " + str(train_Y)) # for ind in range(train_Y): # if train_Y[ind] == 0: # train_Y[ind] = -1 # for ind2 in range(test_Y): # if test_Y[ind] == 0: # test_Y[ind] = -1 #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print(scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100, learning_rate=1.0, random_state=0) ada_boost.fit(scaled_train_X, train_Y) pred_Y = ada_boost.predict(scaled_test_X) f1_score_adda = f1_score(test_Y, pred_Y, average=None) print("f1 score is: " + str(f1_score_adda) ) #f1 score is: [0.63703704 0.36363636] for learning rate 1.0 mean_train_accuracy = ada_boost.score(scaled_train_X, train_Y) print("mean_train_accuracy is: " + str(mean_train_accuracy)) mean_test_accuracy = ada_boost.score(scaled_test_X, test_Y) print("mean_test_accuracy is: " + str(mean_test_accuracy)) print(ada_boost)
def main(): #getting and splitting data print("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix, cancer_onehot, test_size=0.20, random_state=0) #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print(scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) max_allowed_iters = 1500 print("SGD fitting") start = timer() #test sgd with l1, l2, or elasticnet sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, penalty='l1', shuffle=False) sgd.fit(scaled_train_X, train_Y) pred_Y = sgd.predict(scaled_test_X) end = timer() print("sgd took: " + str(end - start) + " seconds") num_iters = sgd.n_iter_ print("took " + str(num_iters) + " iterations") print("test accuracy is: " + str(sgd.score(scaled_test_X, test_Y))) print("train accuracy is: " + str(sgd.score(scaled_train_X, train_Y))) score_f1 = f1_score(test_Y, pred_Y, average=None) print("f1_score is: " + str(score_f1)) print(sgd)
def main(): #getting and splitting data print("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix, cancer_onehot, test_size=0.20, random_state=0) #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print(scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) max_allowed_iters = 1500 print("SVM fitting") start = timer() #test sgd with l1, l2, or elasticnet # sgd = linear_model.SGDClassifier(loss='squared_hinge', max_iter=1000, tol=1e-3, # penalty='l1', shuffle=True) # clf_svm = svm.SVC(C=0.5, gamma=0.2, kernel='rbf', shrinking=False) clf_svm = svm.SVC(kernel='linear', shrinking=False) clf_svm.fit(scaled_train_X, train_Y) pred_Y = clf_svm.predict(scaled_test_X) end = timer() print("sgd took: " + str(end - start) + " seconds") # num_iters = clf_svm.n_iter_ # print ("took " + str(num_iters) + " iterations") print("test accuracy is: " + str(clf_svm.score(scaled_test_X, test_Y))) print("train accuracy is: " + str(clf_svm.score(scaled_train_X, train_Y))) print("pred_Y is: " + str(pred_Y)) score_f1 = f1_score(test_Y, pred_Y, average=None) print("f1_score is: " + str(score_f1)) print(clf_svm)
def main(): #getting and splitting data print("parsing data") patients, egm_matrix, cancer_onehot = get_data() train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix, cancer_onehot, test_size=0.20, random_state=0) #form scaled data scaler = preprocessing.StandardScaler().fit(train_X) print(scaler) scaled_train_X = scaler.transform(train_X) scaled_test_X = scaler.transform(test_X) max_allowed_iters = 1500 print("SGD fitting") #test sgd with l1, l2, or elasticnet alpha_values = [ 0.0001, 0.005, 0.02, 0.03, 0.04, 0.05, 0.07, 0.08, 0.09, 0.1 ] #0.07 is best f1_l1_scores = [] f1_elasticnet_scores = [] f1_l2_scores = [] for alpha_val in alpha_values: sgd_elasticnet = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, penalty='elasticnet', alpha=alpha_val, shuffle=False) sgd_elasticnet.fit(scaled_train_X, train_Y) pred_elasticnet_Y = sgd_elasticnet.predict(scaled_test_X) score_elasticnet_f1 = f1_score(test_Y, pred_elasticnet_Y) print("f1_score is: " + str(score_elasticnet_f1) + "for alpha: " + str(alpha_val)) f1_elasticnet_scores.append(score_elasticnet_f1) sgd_l1 = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, penalty='l1', alpha=alpha_val, shuffle=False) sgd_l1.fit(scaled_train_X, train_Y) pred_l1_Y = sgd_l1.predict(scaled_test_X) score_l1_f1 = f1_score(test_Y, pred_l1_Y) f1_l1_scores.append(score_l1_f1) sgd_l2 = sgd_l1 = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, penalty='l2', alpha=alpha_val, shuffle=False) sgd_l2.fit(scaled_train_X, train_Y) pred_l2_f1 = sgd_l2.predict(scaled_test_X) score_l2_f1 = f1_score(test_Y, pred_l2_f1) f1_l2_scores.append(score_l2_f1) plt.plot(alpha_values, f1_elasticnet_scores, 'ro-', alpha_values, f1_l1_scores, 'g^-', alpha_values, f1_l2_scores, 'b2-') plt.ylabel('f1 score') plt.xlabel("regularization alpha values") plt.title("F1 scores for SGD Classifier with Regularization") plt.show()
def main():
    # load_BioAge1HO()
    # get and split the data
    print("parsing data...")
    patients, X, y = get_data()
    print("done.")

    # form test and train sets by peeling off a fixed number of
    # negative and positive examples for the test set
    num_ex, num_features = X.shape
    train_X_arr = []
    train_Y_arr = []
    test_X_arr = []
    test_Y_arr = []
    num_neg_test_ex = 64
    num_pos_test_ex = 42
    for ind in range(num_ex):
        lbl = y[ind]
        if num_pos_test_ex > 0 and lbl == 1:
            test_Y_arr.append(1)
            test_X_arr.append(X[ind])
            num_pos_test_ex -= 1
        elif num_neg_test_ex > 0 and lbl == 0:
            test_Y_arr.append(0)
            test_X_arr.append(X[ind])
            num_neg_test_ex -= 1
        else:
            train_Y_arr.append(lbl)
            train_X_arr.append(X[ind])
    train_X = np.array(train_X_arr)
    train_Y = np.array(train_Y_arr)
    test_X = np.array(test_X_arr)
    test_Y = np.array(test_Y_arr)

    # random forests
    n_estimators_arr = [50, 75, 90, 95, 100, 105, 110, 115]
    # evaluate_hyperparams(n_estimators_arr, random_forest_classifier, train_X, train_Y,
    #                      "Random Forests", "Number of trees per forest")
    # rf_clf = random_forest_classifier(100)
    # run_on_test_data(rf_clf, train_X, train_Y, test_X, test_Y, 'random forest test')

    # sgd w/ l2 regularization
    alphas = [0.0001, 0.001, 0.01, 0.1, 0.7, 1, 5]
    # evaluate_hyperparams(alphas, sgd_classifier, train_X, train_Y,
    #                      "SGD w/ L2 regularization", "alpha values")

    # bagging
    max_features_arr = [17000, 18500, 20000, 20500, 21000, 21100, 21200, 22000]
    # evaluate_hyperparams(max_features_arr, bagging_classifier, train_X, train_Y,
    #                      "Bagging", "Maximum Features for Estimator")
    # bagging_clf = bagging_classifier(21000)
    # run_on_test_data(bagging_clf, train_X, train_Y, test_X, test_Y, 'bagging test')

    # adaboost
    learn_rates = [0.2, 0.4, 0.5, 0.7, 0.9, 1.0]
    # evaluate_hyperparams(learn_rates, adaBoost_classifier, train_X, train_Y,
    #                      "AdaBoost", "Learning Rate")

    # sigmoid kernel
    # sigmoid_penalty_terms = [0.2, 0.4, 0.5, 0.7, 0.75, 0.85, 1.0]
    # evaluate_hyperparams(sigmoid_penalty_terms, svm_classifier, train_X, train_Y,
    #                      "Sigmoid Kernel", "Penalty term")
    sigmoid_coef0_arr = [0.0, 0.2, 0.25, 0.3, 0.4, 0.5, 0.7]
    # evaluate_hyperparams(sigmoid_coef0_arr, svm_classifier, train_X, train_Y,
    #                      "Sigmoid Kernel", "Constant value")
    # sigmoid_clf = svm_classifier(0.0)
    # run_on_test_data(sigmoid_clf, train_X, train_Y, test_X, test_Y, 'sigmoid kernel test')

    # polynomial kernel
    degrees = [1, 2, 3, 4]
    # evaluate_hyperparams(degrees, svm_classifier, train_X, train_Y,
    #                      "Polynomial Kernel", "Degree of Polynomial")
    # poly_svm = svm_classifier(1)
    # run_on_test_data(poly_svm, train_X, train_Y, test_X, test_Y, 'poly test')

    # rbf svm
    rbf_gamma_arr = [0.1, 0.2, 0.4, 0.55, 0.6, 0.75, 0.8, 0.85, 0.9]
    # rbf_penalty_terms = [0.2, 0.4, 0.5, 0.7, 0.75, 0.85]
    # evaluate_hyperparams(rbf_gamma_arr, svm_classifier, train_X, train_Y,
    #                      "RBF kernel", 'Gamma values')
    # rbf_clf = svm_classifier()
    # run_on_test_data(rbf_clf, train_X, )

    # PCA
    # retained_variance = [0.5, 0.6, 0.75, 0.85, 0.95]
    num_components = [75, 80, 85, 90, 95, 100, 110, 115, 120, 125, 130, 135, 140]
    # evaluate_hyperparams(num_components, PCA_classifier, train_X, train_Y,
    #                      "PCA", "Number of Components Retained")
    # pca_clf = PCA_classifier(110)
    # run_on_test_data(pca_clf, train_X, train_Y, test_X, test_Y, 'pca test')

    # decision trees
    max_depths = [1, 2, 3, 4, 5, 6]
    # max_features = ['auto', 'sqrt', None]
    # evaluate_hyperparams(max_depths, tree_classifier, train_X, train_Y,
    #                      "Decision Tree", "Maximum Depth")
    # dt_clf = tree_classifier(4)
    # run_on_test_data(dt_clf, train_X, train_Y, test_X, test_Y, "decision tree test")

    # cross-validated LR summary
    # lr_model = LR_classifier()
    # lr_results = test_model(lr_model, train_X, train_Y)
    # print("lr results:")
    # print("avg_train_acc: " + str(lr_results[0]))
    # print("avg_test_acc: " + str(lr_results[1]))
    # print("avg_prec: " + str(lr_results[2]))
    # print("avg_recall: " + str(lr_results[3]))
    # print("f1_score: " + str(lr_results[4]))

    # lr train/test
    # test_model = LogisticRegression(solver='lbfgs', max_iter=MAX_ITERS)
    test_model = LR_classifier()
    test_model.fit(train_X, train_Y)
    test_pred = test_model.predict(test_X)
    train_pred = test_model.predict(train_X)

    train_acc = accuracy_score(train_Y, train_pred)
    test_acc = accuracy_score(test_Y, test_pred)
    prec_score = precision_score(test_Y, test_pred)
    rec_score = recall_score(test_Y, test_pred)
    lr_f1_score = f1_score(test_Y, test_pred)

    print("1's in train_pred: " + str(train_pred.tolist().count(1)))
    print("1's in test_pred: " + str(test_pred.tolist().count(1)))
    print("test_acc: " + str(test_acc))
    print("train_acc: " + str(train_acc))
    print("prec_score: " + str(prec_score))
    print("rec_score: " + str(rec_score))
    print("f1_score: " + str(lr_f1_score))

    lr_conf_matrix = confusion_matrix(test_Y, test_pred)
    print("lr confusion matrix is: " + str(lr_conf_matrix))
