def main(num_iters=5):
    """Run the classifier-comparison experiment and record results in the DB.

    Opens a trusted connection to the pardee_datascience SQL Server database,
    registers a new row in ml_runs, then for each of `num_iters` iterations
    trains several classifiers (Perceptron, LinearSVC, RandomForest, naive
    Bayes, SVM) via try_pred, collecting per-model predictions. Predictions
    are merged with the ACG baseline, written to preds_d.csv, and stored via
    record_predictions.

    Parameters
    ----------
    num_iters : int, default 5
        Number of train/test split iterations; also seeds the classifiers
        that take a random_state, so each iteration uses a different seed.
    """
    pyodbc.lowercase = False
    conn = pyodbc.connect(
        r"DRIVER=SQL Server;Trusted_Connection=Yes;"
        r"DATABASE=pardee_datascience;SERVER=kpwhri_datascience.ghc.org"
    )
    # Ensure the connection is released even if training or recording fails.
    try:
        conn.execute("insert into ml_runs (runtime) values (DEFAULT)")
        conn.commit()
        # NOTE(review): max(run_id) is race-prone if two runs insert
        # concurrently — consider SCOPE_IDENTITY() / OUTPUT clause. TODO confirm
        # single-writer assumption before changing.
        run_id = conn.execute("select max(run_id) as run_id from ml_runs").fetchone()[0]

        xs, ys, acg_preds = fetch_data(n=80000, connection=conn)

        for i in range(num_iters):
            x_train, x_test, y_train, y_test = dev_val_split(xs, ys)
            # Start the prediction frame from the true labels; model columns
            # are appended below, keyed by each classifier's nickname.
            preds = y_test.copy(deep=True)

            print_stats(y_train, 'Training')
            print_stats(y_test, 'Test')

            # TODO: find out what these params do (apart from suppress a warning)
            clf = Perceptron(max_iter=1000, tol=0.001)
            clf.nickname = "perceptron "
            try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = LinearSVC(random_state=i)
            clf.nickname = "LinearSVC"
            try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = RandomForestClassifier(n_jobs=-1, max_leaf_nodes=100, random_state=i)
            clf.nickname = "RandmForest"
            preds["RandmForest"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = GaussianNB()
            clf.nickname = "naive bayes"
            preds["naive_bayes"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            clf = svm.SVC(probability=True)
            clf.nickname = "SVM "
            preds["SVM"] = try_pred(run_id, conn, clf, y_train, y_test, x_train, x_test)

            # Uniform-random scores as a sanity-check baseline.
            preds["Random"] = np.random.rand(preds.shape[0])
            # Attach the ACG baseline predictions by shared index.
            preds = pd.merge(preds, acg_preds, left_index=True, right_index=True)
            preds.to_csv(HOME_DIR + "preds_d.csv")
            record_predictions(run_id, conn, preds)
    finally:
        conn.close()