Example #1
from sklearn import preprocessing
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def main():
    # getting and splitting data (get_data is the project's loader; a
    # stand-in sketch appears after this example)
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                                max_features=20000,
                                max_samples=1.0,
                                n_estimators=11,
                                random_state=2)

    bagging.fit(scaled_train_X, train_Y)

    pred_Y = bagging.predict(scaled_test_X)

    f1_score_bagging = f1_score(test_Y, pred_Y, average=None)
    print("f1 score is: " + str(f1_score_bagging)
          )  #max_features=20000 -> f1 score is: [0.77852349 0.47619048]

    mean_train_accuracy = bagging.score(scaled_train_X, train_Y)
    print("mean_train_accuracy is: " + str(mean_train_accuracy))
    mean_test_accuracy = bagging.score(scaled_test_X, test_Y)
    print("mean_test_accuracy is: " + str(mean_test_accuracy))
    print(bagging)
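
Every example in this section calls a project-specific get_data() whose definition is not shown. The stand-in below is a sketch, not the project's real loader: the shapes and random contents are assumptions (the real dataset appears to have more than 20,000 features, given max_features=20000 above), but it returns the same triple the examples unpack and makes them runnable end to end.

import numpy as np


def get_data():
    # Hypothetical stand-in for the project's loader (assumed shapes):
    # returns patient IDs, an (n_samples, n_features) feature matrix,
    # and a binary label vector, mirroring how the examples unpack it.
    rng = np.random.RandomState(0)
    n_samples, n_features = 200, 25000
    patients = ["patient_%d" % i for i in range(n_samples)]
    egm_matrix = rng.randn(n_samples, n_features)
    cancer_onehot = rng.randint(0, 2, size=n_samples)
    return patients, egm_matrix, cancer_onehot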
Example #2
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def main():
    # getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()  # project-specific loader
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    # note: max_features='auto' (an alias for 'sqrt' in classifiers) was
    # removed in newer scikit-learn releases, so 'sqrt' is used here
    rand_forest = RandomForestClassifier(bootstrap=True, criterion='gini',
                                         n_estimators=80, max_depth=3,
                                         max_features='sqrt', oob_score=True,
                                         random_state=0)
    rand_forest.fit(scaled_train_X, train_Y)

    pred_Y = rand_forest.predict(scaled_test_X)
    rf_f1_score = f1_score(test_Y, pred_Y, average=None)
    print("rf_f1_score is: " + str(rf_f1_score))

    print("oob score is: " + str(rand_forest.oob_score_))

    rf_mean_train_accuracy = rand_forest.score(scaled_train_X, train_Y)
    print("mean rf train accuracy is: " + str(rf_mean_train_accuracy))

    rf_mean_test_accuracy = rand_forest.score(scaled_test_X, test_Y)
    print("mean rf test accuracy is: " + str(rf_mean_test_accuracy))

    print(rand_forest)
    print(len(rand_forest.feature_importances_))
    feat_importance = rand_forest.feature_importances_
    considered_feats = 0
    total_feats = 0
    total_weight = 0
    max_weight = 0
    for feat in feat_importance:
        if feat > 0:
            considered_feats += 1
            total_weight += feat
            max_weight = max(max_weight, feat)
        total_feats += 1
    print("considered_feats is: " + str(considered_feats))
    print("total_weight is: " + str(total_weight))
    print("max weight is: " + str(max_weight))
Example #3
from timeit import default_timer as timer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def main():
    # getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()  # project-specific loader
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    # total_train_ex = 0
    # total_train_pos = 0
    # for lbl in train_Y:
    #     total_train_ex += 1
    #     if lbl == 1:
    #         total_train_pos += 1
    # print ("total_train_ex: " + str(total_train_ex) + " total_train_pos: " + str(total_train_pos))

    # total_test_ex = 0
    # total_test_pos = 0
    # for lbl in test_Y:
    #     total_test_ex += 1
    #     if lbl == 1:
    #         total_test_pos += 1
    # print ("total_test_ex: " + str(total_test_ex) + " total_test_pos: " + str(total_test_pos))
    # upper bound on F1: score the test labels against themselves,
    # i.e. the F1 a perfect predictor would achieve on this split
    best_f1_score = f1_score(test_Y, test_Y, average=None)
    print("best_f1_score: " + str(best_f1_score))

    print ("LR fitting")
    start = timer()
    max_allowed_iters = 1500
    # lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=max_allowed_iters)
    # lr = LogisticRegression(solver='lbfgs', max_iter=400)
    lr = LogisticRegression(solver='lbfgs', max_iter=max_allowed_iters)
    lr.fit(train_X, train_Y)
    # print ("LR predicting")
    pred_Y = lr.predict(test_X)
    end = timer()
    print ("logistic regression took: " + str(end - start) + " seconds")

    num_iters = lr.n_iter_
    print ("took " + str(num_iters[0]) + " iterations")

    print ("test accuracy is: " + str(lr.score(test_X, test_Y)))
    print ("train accuracy is: " + str(lr.score(train_X, train_Y)))

    score_f1 = f1_score(test_Y, pred_Y, average=None)
    print ("f1_score is: " + str(score_f1))
Example #4
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def main():
    #getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    # make y sets +/- 1 (kept disabled; index bugs in the original fixed:
    # range() needs len(), and the second loop mixed up ind/ind2)
    # print ("train_Y is: " + str(train_Y))
    # for ind in range(len(train_Y)):
    #     if train_Y[ind] == 0:
    #         train_Y[ind] = -1
    # for ind2 in range(len(test_Y)):
    #     if test_Y[ind2] == 0:
    #         test_Y[ind2] = -1

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                   n_estimators=100,
                                   learning_rate=1.0,
                                   random_state=0)
    ada_boost.fit(scaled_train_X, train_Y)

    pred_Y = ada_boost.predict(scaled_test_X)

    # f1 score is: [0.63703704 0.36363636] for learning rate 1.0
    f1_score_adda = f1_score(test_Y, pred_Y, average=None)
    print("f1 score is: " + str(f1_score_adda))

    mean_train_accuracy = ada_boost.score(scaled_train_X, train_Y)
    print("mean_train_accuracy is: " + str(mean_train_accuracy))
    mean_test_accuracy = ada_boost.score(scaled_test_X, test_Y)
    print("mean_test_accuracy is: " + str(mean_test_accuracy))
    print(ada_boost)
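
To check whether all 100 boosting rounds are actually helping, scikit-learn's staged_predict can score the ensemble after each iteration. This optional diagnostic is not part of the original script; it reuses the ada_boost, scaled_test_X, and test_Y names from the example above.

from sklearn.metrics import accuracy_score

# Test accuracy after each boosting iteration; a long plateau suggests
# n_estimators could be reduced without hurting performance.
staged_acc = [accuracy_score(test_Y, stage_pred)
              for stage_pred in ada_boost.staged_predict(scaled_test_X)]
best_stage = max(range(len(staged_acc)), key=staged_acc.__getitem__)
print("best test accuracy " + str(staged_acc[best_stage]) +
      " at iteration " + str(best_stage + 1))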
Example #5
from timeit import default_timer as timer

from sklearn import linear_model, preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def main():
    #getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    max_allowed_iters = 1500
    print("SGD fitting")
    start = timer()
    #test sgd with l1, l2, or elasticnet
    sgd = linear_model.SGDClassifier(max_iter=1000,
                                     tol=1e-3,
                                     penalty='l1',
                                     shuffle=False)
    sgd.fit(scaled_train_X, train_Y)
    pred_Y = sgd.predict(scaled_test_X)
    end = timer()
    print("sgd took: " + str(end - start) + " seconds")

    num_iters = sgd.n_iter_
    print("took " + str(num_iters) + " iterations")

    print("test accuracy is: " + str(sgd.score(scaled_test_X, test_Y)))
    print("train accuracy is: " + str(sgd.score(scaled_train_X, train_Y)))

    score_f1 = f1_score(test_Y, pred_Y, average=None)
    print("f1_score is: " + str(score_f1))

    print(sgd)
Example #6
from timeit import default_timer as timer

from sklearn import preprocessing, svm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def main():
    #getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    max_allowed_iters = 1500
    print("SVM fitting")
    start = timer()
    # earlier SGD attempt kept for reference; this example fits an SVM
    # sgd = linear_model.SGDClassifier(loss='squared_hinge', max_iter=1000, tol=1e-3,
    #                                 penalty='l1', shuffle=True)
    # clf_svm = svm.SVC(C=0.5, gamma=0.2, kernel='rbf', shrinking=False)
    clf_svm = svm.SVC(kernel='linear', shrinking=False)
    clf_svm.fit(scaled_train_X, train_Y)
    pred_Y = clf_svm.predict(scaled_test_X)
    end = timer()
    print("sgd took: " + str(end - start) + " seconds")

    # num_iters = clf_svm.n_iter_
    # print ("took " + str(num_iters) + " iterations")

    print("test accuracy is: " + str(clf_svm.score(scaled_test_X, test_Y)))
    print("train accuracy is: " + str(clf_svm.score(scaled_train_X, train_Y)))
    print("pred_Y is: " + str(pred_Y))
    score_f1 = f1_score(test_Y, pred_Y, average=None)
    print("f1_score is: " + str(score_f1))
    print(clf_svm)
Example #7
import matplotlib.pyplot as plt
from sklearn import linear_model, preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def main():
    #getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    max_allowed_iters = 1500
    print("SGD fitting")
    #test sgd with l1, l2, or elasticnet

    alpha_values = [
        0.0001, 0.005, 0.02, 0.03, 0.04, 0.05, 0.07, 0.08, 0.09, 0.1
    ]  #0.07 is best
    f1_l1_scores = []
    f1_elasticnet_scores = []
    f1_l2_scores = []
    for alpha_val in alpha_values:
        sgd_elasticnet = linear_model.SGDClassifier(max_iter=1000,
                                                    tol=1e-3,
                                                    penalty='elasticnet',
                                                    alpha=alpha_val,
                                                    shuffle=False)
        sgd_elasticnet.fit(scaled_train_X, train_Y)
        pred_elasticnet_Y = sgd_elasticnet.predict(scaled_test_X)

        score_elasticnet_f1 = f1_score(test_Y, pred_elasticnet_Y)
        print("f1_score is: " + str(score_elasticnet_f1) + "for alpha: " +
              str(alpha_val))

        f1_elasticnet_scores.append(score_elasticnet_f1)

        sgd_l1 = linear_model.SGDClassifier(max_iter=1000,
                                            tol=1e-3,
                                            penalty='l1',
                                            alpha=alpha_val,
                                            shuffle=False)
        sgd_l1.fit(scaled_train_X, train_Y)
        pred_l1_Y = sgd_l1.predict(scaled_test_X)

        score_l1_f1 = f1_score(test_Y, pred_l1_Y)
        f1_l1_scores.append(score_l1_f1)

        sgd_l2 = linear_model.SGDClassifier(max_iter=1000,
                                            tol=1e-3,
                                            penalty='l2',
                                            alpha=alpha_val,
                                            shuffle=False)
        sgd_l2.fit(scaled_train_X, train_Y)
        pred_l2_Y = sgd_l2.predict(scaled_test_X)
        score_l2_f1 = f1_score(test_Y, pred_l2_Y)
        f1_l2_scores.append(score_l2_f1)

    plt.plot(alpha_values, f1_elasticnet_scores, 'ro-', alpha_values,
             f1_l1_scores, 'g^-', alpha_values, f1_l2_scores, 'b2-')
    plt.ylabel('f1 score')
    plt.xlabel("regularization alpha values")
    plt.title("F1 scores for SGD Classifier with Regularization")
    plt.show()
Example #8
import numpy as np
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score)


def main():
    #load_BioAge1HO()

    #getting and splitting data
    print("parsing data...")
    patients, X, y = get_data()  # project-specific loader (see stub after Example #1)
    print("done.")

    # form test and train sets: hold out a fixed number of negative (64)
    # and positive (42) examples for the test set, train on the rest
    num_ex, num_features = X.shape
    train_X_arr = []
    train_Y_arr = []
    test_X_arr = []
    test_Y_arr = []
    num_neg_test_ex = 64
    num_pos_test_ex = 42
    for ind in range(num_ex):
        lbl = y[ind]
        if num_pos_test_ex > 0 and lbl == 1:
            test_Y_arr.append(1)
            test_X_arr.append(X[ind])
            num_pos_test_ex -= 1
        elif num_neg_test_ex > 0 and lbl == 0:
            test_Y_arr.append(0)
            test_X_arr.append(X[ind])
            num_neg_test_ex -= 1
        else:
            train_Y_arr.append(lbl)
            train_X_arr.append(X[ind])

    train_X = np.array(train_X_arr)
    train_Y = np.array(train_Y_arr)
    test_X = np.array(test_X_arr)
    test_Y = np.array(test_Y_arr)


    #random forests
    # n_estimators_arr = [50, 75, 90, 95, 100, 105, 110,115]
    # evaluate_hyperparams(n_estimators_arr, random_forest_classifier,
                            # train_X, train_Y, "Random Forests", "Number of trees per forest")
    # rf_clf = random_forest_classifier(100)
    # run_on_test_data(rf_clf, train_X, train_Y, test_X, test_Y, 'random forest test')
    #sgd w/ l2 regularization
    alphas = [0.0001, 0.001, 0.01, 0.1, 0.7, 1, 5]
    # evaluate_hyperparams(alphas, sgd_classifier, train_X, train_Y,
    #                     "SGD w/ L2 regularization", "alpha values")



    #bagging
    max_features_arr = [17000, 18500, 20000, 20500, 21000, 21100, 21200, 22000]
    # evaluate_hyperparams(max_features_arr, bagging_classifier, train_X, train_Y,
                        # "Bagging", "Maximum Features for Estimator")
    # bagging_clf = bagging_classifier(21000)
    # run_on_test_data(bagging_clf, train_X, train_Y, test_X, test_Y, 'bagging test')

    #adaboost
    learn_rates = [0.2, 0.4, 0.5, 0.7, 0.9, 1.0]
    # evaluate_hyperparams(learn_rates, adaBoost_classifier, train_X, train_Y,
    #                     "AdaBoost", "Learning Rate")
    
    #sigmoid
    # sigmoid_penalty_terms = [0.2, 0.4, 0.5, 0.7, 0.75, 0.85, 1.0]
    # evaluate_hyperparams(sigmoid_penalty_terms, svm_classifier, train_X, train_Y,
    #                     "Sigmoid Kernel", "Penalty term")
    sigmoid_coef0_arr = [0.0, 0.2, 0.25, 0.3, 0.4, 0.5, 0.7]
    # evaluate_hyperparams(sigmoid_coef0_arr, svm_classifier, train_X, train_Y,
    #                     "Sigmoid Kernel", "Constant value")
    # sigmoid_clf = svm_classifier(0.0)
    # run_on_test_data(sigmoid_clf, train_X, train_Y, test_X, test_Y, 'sigmoid kernel test')

    #polynomial
    degrees = [1,2,3,4]
    # evaluate_hyperparams(degrees, svm_classifier, train_X, train_Y,
                        # "Polynomial Kernel", "Degree of Polynomial")
    # poly_svm = svm_classifier(1)
    # run_on_test_data(poly_svm, train_X, train_Y, test_X, test_Y, 'poly test')

    #rbf svm
    rbf_gamma_arr = [0.1, 0.2, 0.4, 0.55, 0.6, 0.75, 0.8, 0.85, 0.9]
    # rbf_penalty_terms = [0.2, 0.4, 0.5, 0.7, 0.75, 0.85]
    # evaluate_hyperparams(rbf_gamma_arr, svm_classifier, train_X, train_Y,
    #                     "RBF kernel", 'Gamma values')
    # rbf_clf = svm_classifier()
    # run_on_test_data(rbf_clf, train_X, )

    #PCA
    # retained_variance = [0.5, 0.6, 0.75, 0.85, 0.95]
    num_components = [75, 80, 85, 90, 95, 100, 110, 115, 120, 125, 130, 135, 140]
    # evaluate_hyperparams(num_components, PCA_classifier, train_X, train_Y,
                        # "PCA", "Number of Components Retained")
    # pca_clf = PCA_classifier(110)
    # run_on_test_data(pca_clf, train_X, train_Y, test_X, test_Y, 'pca test')

    #decision trees
    max_depths = [1,2,3,4,5,6]
    # max_features = ['auto', 'sqrt', None]
    # evaluate_hyperparams(max_depths, tree_classifier, train_X, train_Y,
                        # "Decision Tree", "Maximum Depth")
    # dt_clf = tree_classifier(4)
    # run_on_test_data(dt_clf, train_X, train_Y, test_X, test_Y, "decision tree test")


    # lr_model = LR_classifier()
    # lr_results = test_model(lr_model, train_X, train_Y)
    # print ("lr results:")
    # print ("avg_train_acc: " + str(lr_results[0]))
    # print ("avg_test_acc: " + str(lr_results[1]))
    # print ("avg_prec: " + str(lr_results[2]))
    # print ("avg_recall: " + str(lr_results[3]))
    # print ("f1_score: " + str(lr_results[4]))

    #lr train/test
    # test_model = LogisticRegression(solver='lbfgs', max_iter=MAX_ITERS)
    test_model = LR_classifier()  # project-specific wrapper defined elsewhere
    test_model.fit(train_X, train_Y)
    test_pred = test_model.predict(test_X)
    train_pred = test_model.predict(train_X)
    train_acc = accuracy_score(train_Y, train_pred)
    test_acc = accuracy_score(test_Y, test_pred)
    prec_score = precision_score(test_Y, test_pred)
    rec_score = recall_score(test_Y, test_pred)
    lr_f1_score = f1_score(test_Y, test_pred)
    print ("1's in train_pred: " + str(train_pred.tolist().count(1)))
    print ("1's in test_pred: " + str(test_pred.tolist().count(1)))
    print ("test_acc: " + str(test_acc))
    print ("train_acc: " + str(train_acc))
    print ("prec_score: " + str(prec_score))
    print ("rec_score: " + str(rec_score))
    print ("f1_score: " + str(lr_f1_score))
    lr_conf_matrix = confusion_matrix(test_Y, test_pred)
    print ("lr confusion matrix is: " + str(lr_conf_matrix))