예제 #1
0
def main(num_iters = 5):
    """Fit a set of classifiers on repeated dev/validation splits and record the results.

    Opens a trusted connection to the ``pardee_datascience`` database, inserts
    a new row into ``ml_runs`` to obtain a run id, fetches the data once, and
    then, ``num_iters`` times: splits the data, runs each classifier through
    ``try_pred``, gathers the stored outputs into ``preds``, merges in
    ``acg_preds``, writes the frame to CSV and hands it to
    ``record_predictions``.

    The connection is closed when the function exits, whether it finishes
    normally or an exception propagates.

    Args:
        num_iters: Number of split/fit/record rounds to run.
    """
    pyodbc.lowercase = False
    conn = pyodbc.connect(r"DRIVER=SQL Server;Trusted_Connection=Yes;DATABASE=pardee_datascience;SERVER=kpwhri_datascience.ghc.org")
    # try/finally rather than `with conn:` -- pyodbc's connection context
    # manager commits on exit but does not close the connection.
    try:
        conn.execute("insert into ml_runs (runtime) values (DEFAULT)")
        conn.commit()
        # NOTE(review): max(run_id) is only the row inserted above if no other
        # session inserts into ml_runs in between -- confirm this script is the
        # sole writer, or switch to SCOPE_IDENTITY()/OUTPUT.
        run_id = conn.execute("select max(run_id) as run_id from ml_runs").fetchone()[0]

        xs, ys, acg_preds = fetch_data(n = 80000, connection = conn)
        # print(acg_preds[0:3])
        # print(len(acg_preds))

        for i in range(num_iters):
            # Alternative split kept for reference:
            # x_train, x_test, y_train, y_test = train_test_split(xs, ys,
            #                                                     test_size=0.33,
            #                                                     random_state=0,
            #                                                     stratify=ys)
            x_train, x_test, y_train, y_test = dev_val_split(xs, ys)

            # preds starts as a deep copy of the test labels; one entry per
            # model is added below. Presumably y_test is a DataFrame so these
            # become columns -- TODO confirm against dev_val_split.
            preds = y_test.copy(deep=True)
            # preds = pd.DataFrame({'hospitalized': y_test})
            print_stats(y_train, 'Training')
            print_stats(y_test, 'Test')

            # Perceptron and LinearSVC are passed to try_pred but their return
            # values are not stored in preds.
            clf = Perceptron(max_iter=1000, tol = .001) # TODO: find out what these params do (apart from suppress a warning)
            clf.nickname = "perceptron "
            try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            # random_state=i gives each iteration a different seed.
            clf = LinearSVC(random_state=i)
            clf.nickname = "LinearSVC"
            try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = RandomForestClassifier(n_jobs=-1, max_leaf_nodes=100, random_state=i)
            clf.nickname = "RandmForest"
            preds["RandmForest"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = GaussianNB()
            clf.nickname = "naive bayes"
            preds["naive_bayes"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            # clf = LinearDiscriminantAnalysis(solver='svd')
            # clf.nickname = "LDA        "
            # preds["LDA"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = svm.SVC(probability=True)
            clf.nickname = "SVM        "
            preds["SVM"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            # Uniform random values in [0, 1), one per row of preds.
            preds["Random"] = np.random.rand(preds.shape[0])

            # Join on index with the predictions returned by fetch_data.
            preds = pd.merge(preds, acg_preds, left_index=True, right_index=True)

            # The same path is written on every iteration, so the file ends up
            # holding only the last iteration's frame.
            preds.to_csv(HOME_DIR + "preds_d.csv")

            record_predictions(run_id, conn, preds)
    finally:
        # Release the database connection even when a step above raises.
        conn.close()