def main():
    """Run the KNeighborsClassifier experiments end to end.

    Sequence (unchanged from the original script): fit/predict on the raw
    columns, fit/predict on scaled columns with uniform and then distance
    weighting, explore parameters, cross-validate, run the column
    optimisation, and finally repeat cross-validation / fit / predict / plot
    on the optimised column list.

    Scores quoted in the comments are the original author's recorded
    results, not values computed or checked here.
    """
    title = get_app_title()
    file_tag = get_app_file()
    plot_dir = make_plotdir()
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    all_columns = numeric_vars

    def fit_and_score(model, train_X, holdout_X):
        """Fit *model* on the training data, then predict the test data."""
        do_fit(model, train_X, loans_y, print_out=True)
        return do_predict(model, holdout_X, test_y, print_out=True)

    # Raw, unscaled columns: recorded fit score 0.68, predict score 0.64.
    # (The "rawvar" plot was already disabled in the original.)
    fit_and_score(KNeighborsClassifier(n_neighbors=11), loans_df, test_df)

    # Scaled columns: recorded fit score 0.89, predict score 0.87.
    # Author's open question: other params? n_neighbors, leaf_size, algorithm
    scaled_train, scaler = scale_train_data(loans_df, print_out=True)
    scaled_test = scale_test_data(scaler, test_df)
    predicted = fit_and_score(
        KNeighborsClassifier(n_neighbors=11), scaled_train, scaled_test
    )
    plot_predict(plot_dir, title, file_tag, "allvar", all_columns,
                 test_df, test_y, predicted)

    # Distance weighting: recorded fit score 1.00, predict score 0.87, overfit?
    fit_and_score(
        KNeighborsClassifier(n_neighbors=11, weights='distance'),
        scaled_train,
        scaled_test,
    )

    explore_params(scaled_train, loans_y, plot_dir, title, file_tag)

    cross_validate(KNeighborsClassifier(n_neighbors=11), scaled_train, loans_y,
                   print_out=True)

    # Column optimisation runs on the unscaled frame; only the returned
    # column list is used afterwards.
    _, best_columns = run_opt(KNeighborsClassifier(n_neighbors=11),
                              numeric_vars, loans_df, loans_y,
                              title, file_tag, plot_dir)

    # Re-scale using only the optimised columns and evaluate once more.
    opt_train, opt_scaler = scale_train_data(loans_df[best_columns])
    opt_test = scale_test_data(opt_scaler, test_df[best_columns])

    # The same estimator instance is cross-validated and then fitted,
    # exactly as in the original.
    final_model = KNeighborsClassifier(n_neighbors=11)
    cross_validate(final_model, opt_train, loans_y, print_out=True)
    predicted = fit_and_score(final_model, opt_train, opt_test)
    plot_predict(plot_dir, title, file_tag, "optvar", best_columns,
                 test_df, test_y, predicted)
def main():
    """Run the gnb() classifier experiments end to end.

    Sequence (unchanged from the original script): fit/predict/plot on raw
    columns, the same on scaled columns, cross-validate on raw columns, run
    the column optimisation without rescaling, then cross-validate and
    fit/predict/plot on the optimised column list.

    Percentages quoted in the comments are the original author's recorded
    results, not values computed or checked here.
    """
    title = get_app_title()
    file_tag = get_app_file()
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    all_columns = numeric_vars
    print("numeric_vars\n", numeric_vars)
    plot_dir = make_plotdir()

    def evaluate(train_X, holdout_X, label, columns):
        """Fit a fresh gnb(), predict the test data and plot under *label*."""
        model = gnb()
        do_fit(model, train_X, loans_y, print_out=True)
        predicted = do_predict(model, holdout_X, test_y, print_out=True)
        plot_predict(plot_dir, title, file_tag, label, columns,
                     test_df, test_y, predicted)

    # No scaling: recorded score 87%.
    evaluate(loans_df, test_df, "allvar", all_columns)

    # With scaling: recorded score 87%.
    scaled_train, scaler = scale_train_data(loans_df, print_out=True)
    scaled_test = scale_test_data(scaler, test_df)
    evaluate(scaled_train, scaled_test, "allscale", all_columns)

    # gnb has no meta-parameters to explore, so go straight to
    # cross-validation and column optimisation on the unscaled data.
    # Recorded cross-validation score: 84% +- 4%.
    cross_validate(gnb(), loans_df, loans_y, print_out=True)

    # Recorded best score: 89% +- 4%.
    _, best_columns = run_opt(gnb(), numeric_vars, loans_df, loans_y,
                              title, file_tag, plot_dir, rescale=False)

    # Redo with the optimised columns (still unscaled).
    opt_train = loans_df[best_columns]
    opt_test = test_df[best_columns]

    # Recorded best score: 89% +- 4%.
    cross_validate(gnb(), opt_train, loans_y, print_out=True)

    # Recorded fit score 89%, predict score 91%.
    evaluate(opt_train, opt_test, "optvar", best_columns)
def xp_accuracy_diff_entropy():
    """Cross-validate each graph kernel on two generated graph sets and pickle the results.

    For every kernel name in ``Graph_Kernel_List`` and each of the two graph
    sets returned by ``generate_graphs()``, ``cross_validate`` is run on a
    copy of the graphs with targets ``[0] * half_num_graphs + [1] *
    half_num_graphs``.  If the run raises, the traceback is logged to
    ``error.txt`` in the output directory and the string ``'error'`` is
    recorded in place of the accuracy and confidence.

    Files written under ``outputs/accuracy_diff_entropy/``:
      * ``accuracy.<kernel>.<set>.pkl`` / ``confidence.<kernel>.<set>.pkl``
        after every individual run,
      * ``accuracies.pkl`` / ``confidences.pkl`` (dicts mapping kernel name to
        a per-set list) once all runs are finished.

    Returns:
        None.
    """
    import os
    import pickle

    save_dir = 'outputs/accuracy_diff_entropy/'

    def _dump(obj, file_name):
        """Pickle *obj* into ``save_dir + file_name``."""
        # The context manager closes (and therefore flushes) the handle even
        # when pickling fails; the original never closed these files.
        with open(save_dir + file_name, 'wb') as handle:
            pickle.dump(obj, handle)

    # Generate graphs.
    graphs1, graphs2 = generate_graphs()

    # Compute entropy of degree distribution of the generated graphs.
    # The return values are not used anywhere below; the calls are kept, in
    # the original order, for whatever side effects get_infos has
    # (presumably it reports the statistics -- TODO confirm).
    get_infos(graphs1[0:half_num_graphs])
    get_infos(graphs1[half_num_graphs:])
    get_infos(graphs2[0:half_num_graphs])
    get_infos(graphs2[half_num_graphs:])

    # Run and save.
    os.makedirs(save_dir, exist_ok=True)

    accuracies = {}
    confidences = {}
    for kernel_name in Graph_Kernel_List:
        print()
        print('Kernel:', kernel_name)

        accuracies[kernel_name] = []
        confidences[kernel_name] = []
        for set_i, graphs in enumerate([graphs1, graphs2]):
            print()
            print('Graph set', set_i)

            # Work on copies so one run cannot alter the graphs used by the next.
            tmp_graphs = [g.copy() for g in graphs]
            # Rebuilt for every run in case cross_validate modifies the list.
            targets = [0] * half_num_graphs + [1] * half_num_graphs

            # Placeholders that survive if the run below fails.
            accuracy = 'error'
            confidence = 'error'
            try:
                accuracy, confidence = cross_validate(
                    tmp_graphs, targets, kernel_name,
                    ds_name=str(set_i),
                    output_dir=save_dir)  # , n_jobs=1)
            except Exception as exp:
                # Deliberate best-effort: one failing kernel/set combination
                # must not abort the remaining experiments.
                print('An exception occured when running this experiment:')
                LOG_FILENAME = save_dir + 'error.txt'
                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                logging.exception('\n' + kernel_name + ', ' + str(set_i) + ':')
                print(repr(exp))

            accuracies[kernel_name].append(accuracy)
            confidences[kernel_name].append(confidence)

            # Per-run results are saved immediately so a later crash does
            # not lose them.
            run_suffix = kernel_name + '.' + str(set_i) + '.pkl'
            _dump(accuracy, 'accuracy.' + run_suffix)
            _dump(confidence, 'confidence.' + run_suffix)

    # Save all.
    _dump(accuracies, 'accuracies.pkl')
    _dump(confidences, 'confidences.pkl')

    return
def main():
    """Run the linear svm.SVC experiments end to end.

    Sequence (unchanged from the original script): fit/predict/plot on scaled
    columns, explore parameters, score one hand-picked column list with
    get_cv_score, run the column optimisation, then cross-validate and
    fit/predict/plot on the optimised column list.
    """
    title = get_app_title()
    file_tag = get_app_file()
    plot_dir = make_plotdir()
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    all_columns = numeric_vars

    def new_svc():
        """Build the classifier configuration used by every step."""
        return svm.SVC(kernel='linear', C=1, cache_size=1000)

    def evaluate(train_X, holdout_X, label, columns):
        """Fit a fresh classifier, predict the test data and plot under *label*."""
        model = new_svc()
        do_fit(model, train_X, loans_y, print_out=True)
        predicted = do_predict(model, holdout_X, test_y, print_out=True)
        plot_predict(plot_dir, title, file_tag, label, columns,
                     test_df, test_y, predicted)

    # All numeric columns, scaled.
    scaled_train, scaler = scale_train_data(loans_df, print_out=True)
    scaled_test = scale_test_data(scaler, test_df)
    evaluate(scaled_train, scaled_test, "allvar", all_columns)
    explore_params(scaled_train, loans_y, plot_dir, title, file_tag)

    # Test the optimisation sub-method on a hand-picked column list.
    probe_columns = ['FICO.Score', 'Amount.Requested', 'Home.Type']
    score, sstd, _ = get_cv_score(new_svc(), probe_columns, loans_df, loans_y)
    print("cv score: %.5f +- %.5f for %s" % (score, 2.0 * sstd, probe_columns))

    # Run the optimisation routine.
    # Author's notes: optimums found all have the same score within std dev
    # (0.89 +- 0.03); svm is therefore less influenced by parameters chosen
    # than naive_bayes.
    _, best_columns = run_opt(new_svc(), numeric_vars, loans_df, loans_y,
                              title, file_tag, plot_dir)

    # Repeat the evaluation on the optimised column list and plot it.
    opt_train, opt_scaler = scale_train_data(loans_df[best_columns])
    opt_test = scale_test_data(opt_scaler, test_df[best_columns])
    cross_validate(new_svc(), opt_train, loans_y, print_out=True)

    # Author's open question: should clf come from opt, or just opt_list?
    # As written, a fresh classifier is fitted and that fitted model is the
    # one used for prediction.
    evaluate(opt_train, opt_test, "optvar", best_columns)
def linear_reg(input_X, input_Y, nb_folds=10):
    """Cross-validate a LinearRegression model and summarise its residuals.

    Args:
        input_X: Feature data, passed straight to ``cross_validate``.
        input_Y: Target values; must support ``predictions - input_Y``.
        nb_folds: Fold count forwarded to ``cross_validate``.

    Returns:
        Tuple ``(predictions, residuals, ms_error)``: whatever
        ``cross_validate`` returned (presumably per-sample predictions
        aligned with ``input_Y`` -- confirm against that helper), a
        DataFrame with one column named ``'residuals'``, and the mean of the
        squared residuals.
    """
    model = LinearRegression()
    predictions = cross_validate(input_X, input_Y, model, nb_folds=nb_folds)

    raw_residuals = predictions - input_Y
    ms_error = np.mean(raw_residuals ** 2)

    # The column is renamed after construction, exactly as before, so the
    # result does not depend on any name the residual data carried.
    residual_frame = pd.DataFrame(raw_residuals)
    residual_frame.columns = ['residuals']

    return predictions, residual_frame, ms_error
def rand_forest_reg(input_X, input_Y, max_depth=None, max_features="auto", nb_folds=10, n_estimators=100):
    """Cross-validate a RandomForestRegressor and summarise its residuals.

    Args:
        input_X: Feature data, passed straight to ``cross_validate``.
        input_Y: Target values; must support ``predictions - input_Y``.
        max_depth: Forwarded to ``RandomForestRegressor``.
        max_features: Forwarded to ``RandomForestRegressor``.  The legacy
            value ``"auto"`` (still the default here, for backward
            compatibility) is translated to ``1.0``, i.e. all features.
        nb_folds: Fold count forwarded to ``cross_validate``.
        n_estimators: Forwarded to ``RandomForestRegressor``.

    Returns:
        Tuple ``(predictions, residuals, ms_error)``: whatever
        ``cross_validate`` returned (presumably per-sample predictions
        aligned with ``input_Y`` -- confirm against that helper), a
        DataFrame with one column named ``'residuals'``, and the mean of the
        squared residuals.
    """
    # scikit-learn deprecated max_features="auto" in 1.1 and removed it in
    # 1.3.  For regressors "auto" meant "use every feature", which is what
    # the float 1.0 means on both old and new releases, so the mapping keeps
    # the old behaviour while letting the default call work on current
    # versions.
    if isinstance(max_features, str) and max_features == "auto":
        max_features = 1.0

    forest = RandomForestRegressor(n_estimators=n_estimators,
                                   max_features=max_features,
                                   max_depth=max_depth)
    predictions = cross_validate(input_X, input_Y, forest, nb_folds=nb_folds)

    residuals = predictions - input_Y
    ms_error = np.mean(residuals ** 2)

    residuals = pd.DataFrame(residuals)
    residuals.columns = ['residuals']

    return predictions, residuals, ms_error
def svr(input_X, input_Y, nb_folds=10, C=1.0, kernel='rbf', degree=3, gamma='auto'):
    """Cross-validate an svm.SVR model and summarise its residuals.

    Args:
        input_X: Feature data, passed straight to ``cross_validate``.
        input_Y: Target values; must support ``predictions - input_Y``.
        nb_folds: Fold count forwarded to ``cross_validate``.
        C, kernel, degree, gamma: Forwarded unchanged to ``svm.SVR``.

    Returns:
        Tuple ``(predictions, residuals, ms_error)``: whatever
        ``cross_validate`` returned (presumably per-sample predictions
        aligned with ``input_Y`` -- confirm against that helper), a
        DataFrame with one column named ``'residuals'``, and the mean of the
        squared residuals.
    """
    regressor = svm.SVR(C=C, kernel=kernel, degree=degree, gamma=gamma)
    predictions = cross_validate(input_X, input_Y, regressor, nb_folds=nb_folds)

    raw_residuals = predictions - input_Y
    ms_error = np.mean(raw_residuals ** 2)

    # Rename the column after construction, as the original did.
    residual_frame = pd.DataFrame(raw_residuals)
    residual_frame.columns = ['residuals']

    return predictions, residual_frame, ms_error
def svc(categorical_input_X, class_input_Y, nb_folds=10, C=1.0, kernel='rbf', degree=3, gamma='auto'):
    """Cross-validate an svm.SVC model, print its error rate and return the confusion matrix.

    Args:
        categorical_input_X: Feature data, passed straight to
            ``cross_validate``.
        class_input_Y: True class labels.  Any label type that supports
            element-wise ``!=`` works (numbers or strings); the original
            implementation subtracted labels and so only handled numbers.
        nb_folds: Fold count forwarded to ``cross_validate``.
        C, kernel, degree, gamma: Forwarded unchanged to ``svm.SVC``.

    Returns:
        Tuple ``(classification, conf_matrix)``: the ``svm.SVC`` instance
        that was handed to ``cross_validate`` (whether it comes back fitted
        depends on that helper -- confirm before predicting with it) and
        ``confusion_matrix(class_input_Y, predictions)``.
    """
    classification = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma)
    predictions = cross_validate(categorical_input_X, class_input_Y, classification, nb_folds=nb_folds)
    conf_matrix = confusion_matrix(class_input_Y, predictions)

    # A sample is an error when its prediction differs from its label.
    # Comparing (rather than subtracting) gives the same count for numeric
    # labels and also works for non-numeric ones.  Both sides are flattened
    # so the comparison is positional, matching how confusion_matrix pairs
    # them up.
    mismatches = np.array(predictions).ravel() != np.array(class_input_Y).ravel()
    error_rate = float(np.sum(mismatches)) / len(class_input_Y)
    print('Error rate of misclassification: {}'.format(error_rate))

    return classification, conf_matrix
def k_nearest_neighbors(input_X, input_Y, nb_folds=10, n_neighbors=15, weights="uniform", algo="auto"):
    """Cross-validate a KNeighborsRegressor and summarise its errors.

    Args:
        input_X: Feature data, passed straight to ``cross_validate``.
        input_Y: Target values; must support ``predictions - input_Y`` and
            ``input_Y + 1``.
        nb_folds: Fold count forwarded to ``cross_validate``.
        n_neighbors, weights: Forwarded to ``KNeighborsRegressor``.
        algo: Forwarded to ``KNeighborsRegressor`` as ``algorithm``.

    Returns:
        Tuple ``(predictions, residuals, ms_error, rmsle)``: whatever
        ``cross_validate`` returned (presumably per-sample predictions
        aligned with ``input_Y`` -- confirm against that helper), a
        DataFrame with one column named ``'residuals'``, the mean of the
        squared residuals, and the square root of the mean squared
        difference between ``log(predictions + 1)`` and ``log(input_Y + 1)``.
    """
    regressor = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algo,
    )
    predictions = cross_validate(input_X, input_Y, regressor, nb_folds=nb_folds)

    raw_residuals = predictions - input_Y
    ms_error = np.mean(raw_residuals ** 2)

    # Rename the column after construction, as the original did.
    residual_frame = pd.DataFrame(raw_residuals)
    residual_frame.columns = ['residuals']

    # Root mean squared logarithmic error; the +1 keeps log() defined at 0.
    log_gap = np.log(predictions + 1) - np.log(input_Y + 1)
    rmsle = np.sqrt(np.mean(log_gap ** 2))

    return predictions, residual_frame, ms_error, rmsle
for s in STRATEGIES} for num_neighbors in NEIGHBORHOOD_SIZES } for sim in SIMILARITIES } for user in tqdm(users): for sim in SIMILARITIES: for num_neighbors in NEIGHBORHOOD_SIZES: for approach in STRATEGIES: all_scores[sim][num_neighbors][approach].append( cross_validate(data=ratings, demographics=demographics, user_id=user, num_vals=CROSS_VALIDATION_SIZE, num_neighbors=num_neighbors, strategy=approach, demographic_factor=DEMOGRAPHIC_FACTOR, sim_metric=sim, personalities=personalities, personality_factor=1)) # Compute mean scores for sim in SIMILARITIES: for num_neighbors in NEIGHBORHOOD_SIZES: for approach in STRATEGIES: scores = all_scores[sim][num_neighbors][approach] mean_rmse = np.nanmean([s[0] for s in scores]) mean_f1 = np.nanmean([s[1] for s in scores]) mean_scores[sim][num_neighbors][approach] = (mean_rmse, mean_f1) save_to_disk(os.path.join('grid_search', 'grid_search_all_scores.pkl'),
all_labels = set() for f in args.model: label = ".".join(os.path.split(f)[-1].split(".")[:-1]) all_labels.add(label) for seq in utils.read_fasta(f): seqs.append(seq) labels[seq["seqid"]] = label best_args = [12, 7, 1, 100] print "width\tdepth\tperiod\terror_rate" for w in get_range(args.window): for d in get_range(args.depth): if w < d: continue for p in get_range(args.period): errors = utils.cross_validate(seqs, labels, int(args.k), window=w, depth=d, period=p) avg_error = sum(errors) / len(errors) print "%s\t%s\t%s\t%0.3f" % (w, d, p, avg_error) sys.stdout.flush() if avg_error < best_args[-1]: best_args = [w, d, p, avg_error] print "Best paramters: w=%s, d=%s, p=%s with error of %0.3f" % (best_args[0], best_args[1], best_args[2], best_args[3])
# Dictionary that stores a list of tuples of RMSE and F1-Scores for each demographic factor all_scores = {d: {s: [] for s in STRATEGIES} for d in DEMOGRAPHIC_FACTORS} # Stores the computed metric tuple means (rmse, f1) for each demographic factor mean_scores = {d: {s: None for s in STRATEGIES} for d in DEMOGRAPHIC_FACTORS} for user in tqdm(users): for approach in STRATEGIES: for d in DEMOGRAPHIC_FACTORS: all_scores[d][approach].append( cross_validate(data=ratings, demographics=demographics, user_id=user, num_vals=CROSS_VALIDATION_SIZE, num_neighbors=NEIGHBORHOOD_SIZE, strategy=approach, demographic_factor=d, sim_metric=SIMILARITIES[approach], personalities=personalities, personality_factor=PERSONALITY_FACTOR)) # Compute mean scores for d in DEMOGRAPHIC_FACTORS: for approach in STRATEGIES: scores = all_scores[d][approach] mean_rmse = np.nanmean([s[0] for s in scores]) mean_f1 = np.nanmean([s[1] for s in scores]) mean_scores[d][approach] = (mean_rmse, mean_f1) save_to_disk( os.path.join('demographic_factor', 'demographic_factor_all_scores.pkl'),
feats = feats[max(feats.keys())] result_folder = os.path.join(cur_path, 'modelling', 'winner', 'final', 'results') with open(os.path.join(result_folder, '%s.json' % (mod_name)), 'r') as fp: res = json.load(fp) res = res[max(res.keys())] model_weights[0][mod_name] = res feat_selector = FeatureSelector(feats) scale_folder = os.path.join(cur_path, 'modelling', 'winner', 'final', 'scalers', mod_name) scale_path = os.path.join(scale_folder, os.listdir(os.path.join(scale_folder))[0]) scale = load(scale_path) mod_preds = cross_validate(X[feats], Y, model, scale, only_scores=False, njobs=1) mod_preds.rename(columns={0: mod_name}, inplace=True) pred_df = pred_df.join(mod_preds) # pipe = Pipeline([('feature_selection', feat_selector), ('scaler', scale), ('clf', model)]) pred_cols = [i for i in list(pred_df) if i != 'winner'] mod_scores = {} for idx in pred_df.index: mod_scores[idx] = {} row = pred_df.loc[idx] for mod in pred_cols: row_score = logloss(row['winner'], row[mod]) mod_scores[idx][mod] = row_score
def main():
    """Run the lr() classifier experiments end to end.

    Sequence (unchanged from the original script): fit/predict/plot on raw
    columns, the same on scaled columns, print the coefficients and keep the
    first six entries of the returned list, evaluate and ROC-plot that
    six-column model, explore parameters, run the column optimisation with
    rescaling, then explore, cross-validate and fit/predict/plot on the
    optimised column list.

    Scores quoted in the comments are the original author's recorded
    results, not values computed or checked here.
    """
    title = get_app_title()
    file_tag = get_app_file()
    plot_dir = make_plotdir()
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    all_columns = numeric_vars

    def scale_pair(train_frame, holdout_frame):
        """Scale the training frame, then apply the same scaler to the test frame."""
        scaled_train, scaler = scale_train_data(train_frame, print_out=True)
        return scaled_train, scale_test_data(scaler, holdout_frame)

    def fit_and_score(train_X, holdout_X):
        """Fit a fresh lr() and return it together with its test predictions."""
        model = lr()
        do_fit(model, train_X, loans_y, print_out=True)
        predicted = do_predict(model, holdout_X, test_y, print_out=True)
        return model, predicted

    # No scaling: recorded score 0.71.
    _, predicted = fit_and_score(loans_df, test_df)
    plot_predict(plot_dir, title, file_tag, "rawvar", all_columns,
                 test_df, test_y, predicted)

    # With scaling: recorded score 0.90.
    train_X, test_X = scale_pair(loans_df, test_df)
    model, predicted = fit_and_score(train_X, test_X)
    plot_predict(plot_dir, title, file_tag, "allvar", all_columns,
                 test_df, test_y, predicted)

    print("columns:", all_columns)
    # Only referenced by calls that were already disabled in the original.
    X_labels = list(loans_df.columns)
    ranked = print_lr_coefs(model, all_columns)

    # Find the score using only the first six entries of that list
    # (presumably ordered by importance -- confirm in print_lr_coefs).
    top6 = [entry[0] for entry in ranked[:6]]
    print("top6:", top6)
    train_X, test_X = scale_pair(loans_df[top6], test_df[top6])
    model, predicted = fit_and_score(train_X, test_X)
    print_lr_coefs(model, top6)
    plot_predict(plot_dir, title, file_tag, "top6", top6,
                 test_df, test_y, predicted)
    do_roc(model, test_X, test_y, "top6", top6, title, file_tag, plot_dir)

    # Parameter exploration still uses the scaled top-6 training data.
    explore_params(train_X, loans_y, plot_dir, title, file_tag)

    # Run the optimisation routine.
    # Recorded accuracy: 73% +- 3% with no scaling (90% with scaling).
    _, opt_list = run_opt(lr(), numeric_vars, loans_df, loans_y,
                          title, file_tag, plot_dir, rescale=True)

    # Redo the exploration with the optimised columns.
    train_X, test_X = scale_pair(loans_df[opt_list], test_df[opt_list])
    explore_params(train_X, loans_y, plot_dir, title, file_tag + "opt_")

    # Author's note here read "accuracy 73% due to no scaling".
    cross_validate(lr(), train_X, loans_y, print_out=True)

    model, predicted = fit_and_score(train_X, test_X)
    print("opt_list columns:", opt_list)
    print_lr_coefs(model, opt_list)
    plot_predict(plot_dir, title, file_tag, "optvar", opt_list,
                 test_df, test_y, predicted)
users = ratings.UserId.unique() # Dictionary that stores a list of tuples of RMSE and F1-Scores for each personality factor all_scores = {p: [] for p in PERSONALITY_FACTORS} # Stores the computed metric tuple means (rmse, f1) for each personality factor mean_scores = {p: None for p in PERSONALITY_FACTORS} for user in tqdm(users): for p in PERSONALITY_FACTORS: all_scores[p].append( cross_validate(data=ratings, demographics=demographics, user_id=user, num_vals=CROSS_VALIDATION_SIZE, num_neighbors=NEIGHBORHOOD_SIZE, strategy=STRATEGY, demographic_factor=DEMOGRAPHIC_FACTOR, sim_metric=SIMILARITIY, personalities=personalities, personality_factor=p)) # Compute mean scores for p in PERSONALITY_FACTORS: mean_rmse = np.nanmean([s[0] for s in all_scores[p]]) mean_f1 = np.nanmean([s[1] for s in all_scores[p]]) mean_scores[p] = (mean_rmse, mean_f1) save_to_disk( os.path.join('personality_factor', 'personality_factor_all_scores.pkl'), all_scores) save_to_disk(
#!/usr/bin/python
# Command-line driver: n-fold cross-validation over labelled sample files.
#
# Usage: cross_validate.py <n-fold> <class1_samples> <class2_samples>...
#
# Prints the error of every fold followed by the average error as a
# percentage.  Exits with status 1 when fewer than three arguments are given.
import utils
import os
import sys
import random
import subprocess

if __name__ == "__main__":
    # Require the fold count plus at least two sample files.
    if len(sys.argv) < 4:
        print("USAGE: cross_validate.py <n-fold> <class1_samples> <class2_samples>...")
        sys.exit(1)

    seqs, labels, all_labels = utils.read_all_labeled(sys.argv[2:])
    folds = int(sys.argv[1])

    # Randomise the sample order before it is handed to cross_validate.
    random.shuffle(seqs)
    errors = utils.cross_validate(seqs, labels, folds)

    # Each print takes one pre-built string, so the output is identical
    # whether print is the Python 2 statement or the Python 3 function.
    print("Fold errors: " + " ".join([str(x) for x in errors]))
    print("Average error: %3.2f%%" % (sum(errors) * 100.0 / len(errors)))