Example #1
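# k-nearest-neighbors classification of the loan data: fit and predict on raw
# features, then on scaled features, cross-validate, and finally re-run on the
# column list returned by run_opt (do_fit, do_predict, run_opt are project helpers).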
def main():
    "main program"
    app = get_app_title()
    appf = get_app_file()
    plotdir = make_plotdir()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    
    # skip scaling for now, fit score 0.68, predict score 0.64
    loans_X = loans_df
    test_X = test_df
    clf = KNeighborsClassifier(n_neighbors=11)
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
#    plot_predict(plotdir, app, appf, "rawvar", indep_vars, test_df, test_y, pred_y)
    
    # add scaling
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    
    # fit score 0.89, predict score 0.87
    clf = KNeighborsClassifier(n_neighbors=11)
    # other params to explore: n_neighbors, leaf_size, algorithm
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    
    # fit score 1.00, predict score 0.87, overfit?
    clf = KNeighborsClassifier(n_neighbors=11, weights='distance')
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    
    explore_params(loans_X, loans_y, plotdir, app, appf)
    
    clf = KNeighborsClassifier(n_neighbors=11)
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = KNeighborsClassifier(n_neighbors=11)
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir)
    
    loans_X, my_scaler = scale_train_data(loans_df[opt_list])
    test_X = scale_test_data(my_scaler, test_df[opt_list])
    
    clf = KNeighborsClassifier(n_neighbors=11)
    cross_validate(clf, loans_X, loans_y, print_out=True)
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
Example #2
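# Same loan-classification workflow with naive Bayes (gnb() presumably constructs
# sklearn's GaussianNB): compare unscaled vs. scaled features, cross-validate, then
# re-run with the column list returned by run_opt.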
def main():
    "main program"
    
    app = get_app_title()
    appf = get_app_file()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    print("numeric_vars\n", numeric_vars)
    
    plotdir = make_plotdir()
    
    loans_X = loans_df
    test_X = test_df
    clf = gnb()         # skip scaling for now, score 87%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    clf = gnb()     # add scaling, score 87%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allscale", indep_vars, test_df, test_y, pred_y)
    
    # gnb has no meta-parameters to explore or optimize
    
    loans_X = loans_df
    test_X = test_df
    clf = gnb()   # score 84% +- 4%
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = gnb()    # best score 89% +- 4%
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir, rescale=False)
    
    # redo with optimized columns
    loans_X = loans_df[opt_list]
    test_X = test_df[opt_list]
    clf = gnb()         # best score 89% +- 4%
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = gnb()         # fit score 89%, predict score 91%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
def xp_accuracy_diff_entropy():
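	# Cross-validate every kernel in Graph_Kernel_List on two generated graph sets
	# and record accuracy/confidence per set; generate_graphs, get_infos and
	# half_num_graphs are assumed to come from the surrounding module.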
	
	# Generate graphs.
	graphs1, graphs2 = generate_graphs()

	
	# Compute entropy of degree distribution of the generated graphs.
	info11 = get_infos(graphs1[0:half_num_graphs])
	info12 = get_infos(graphs1[half_num_graphs:])
	info21 = get_infos(graphs2[0:half_num_graphs])
	info22 = get_infos(graphs2[half_num_graphs:])

	# Run and save.
	import pickle
	import os
	import logging
	save_dir = 'outputs/accuracy_diff_entropy/'
	os.makedirs(save_dir, exist_ok=True)

	accuracies = {}
	confidences = {}
	
	for kernel_name in Graph_Kernel_List:
		print()
		print('Kernel:', kernel_name)
		
		accuracies[kernel_name] = []
		confidences[kernel_name] = []
		for set_i, graphs in enumerate([graphs1, graphs2]):
			print()
			print('Graph set', set_i)
			
			tmp_graphs = [g.copy() for g in graphs]
			targets = [0] * half_num_graphs + [1] * half_num_graphs
			
			accuracy = 'error'
			confidence = 'error'
			try:
				accuracy, confidence = cross_validate(tmp_graphs, targets, kernel_name, ds_name=str(set_i), output_dir=save_dir) #, n_jobs=1)
			except Exception as exp:
				print('An exception occurred when running this experiment:')
				LOG_FILENAME = save_dir + 'error.txt'
				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
				logging.exception('\n' + kernel_name + ', ' + str(set_i) + ':')
				print(repr(exp))
			accuracies[kernel_name].append(accuracy)
			confidences[kernel_name].append(confidence)
			
			pickle.dump(accuracy, open(save_dir + 'accuracy.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
			pickle.dump(confidence, open(save_dir + 'confidence.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
		
	# Save all.	
	pickle.dump(accuracies, open(save_dir + 'accuracies.pkl', 'wb'))	
	pickle.dump(confidences, open(save_dir + 'confidences.pkl', 'wb'))	
	
	return
Example #4
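# Linear-kernel SVM on the scaled loan data: fit, predict and plot, explore
# meta-parameters, then run the feature-list optimizer and repeat on its output.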
def main():
    '''Main program.'''
    app = get_app_title()
    appf = get_app_file()
    plotdir = make_plotdir()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    
    clf = svm.SVC(kernel='linear', C=1, cache_size=1000)
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)   
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    
    explore_params(loans_X, loans_y, plotdir, app, appf)
    
    # test optimization sub-method
    clf = svm.SVC(kernel='linear', C=1, cache_size=1000)
    indep_vars = ['FICO.Score', 'Amount.Requested', 'Home.Type']
    score, sstd, sscores = get_cv_score(clf, indep_vars, loans_df, loans_y)
    print("cv score: %.5f +- %.5f for %s" % (score, 2.0 * sstd, indep_vars))

#   run optimization routine
    clf = svm.SVC(kernel='linear', C=1, cache_size=1000)
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir)
# the optima found all have the same score within one std dev: 0.89 +- 0.03
# SVM is therefore less influenced by the choice of parameters than naive_bayes

#   repeat results of optimized list and plot
    clf = svm.SVC(kernel='linear', C=1, cache_size=1000)
    loans_X, my_scaler = scale_train_data(loans_df[opt_list])
    test_X = scale_test_data(my_scaler, test_df[opt_list])
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    # clf should come from opt?  or just opt_list?
    clf = svm.SVC(kernel='linear', C=1, cache_size=1000)
    do_fit(clf, loans_X, loans_y, print_out=True)
    # optimum clf model from do_fit, use in do_predict
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
def linear_reg(input_X, input_Y, nb_folds=10):
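    # Cross-validated linear regression: cross_validate (project helper) returns
    # per-sample predictions, from which residuals and mean squared error follow.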

    linear = LinearRegression()
    predictions = cross_validate(input_X, input_Y, linear, nb_folds=nb_folds)
    residuals = predictions - input_Y
    ms_error = np.mean(residuals**2)

    residuals = pd.DataFrame(residuals)
    residuals.columns = ['residuals']

    return predictions, residuals, ms_error
def rand_forest_reg(input_X,
                    input_Y,
                    max_depth=None,
                    max_features="auto",
                    nb_folds=10,
                    n_estimators=100):
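    # Same cross-validated pattern with a random forest regressor; tree count,
    # depth and feature sampling are exposed as keyword arguments.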

    forest = RandomForestRegressor(n_estimators=n_estimators,
                                   max_features=max_features,
                                   max_depth=max_depth)
    predictions = cross_validate(input_X, input_Y, forest, nb_folds=nb_folds)
    residuals = predictions - input_Y
    ms_error = np.mean(residuals**2)

    residuals = pd.DataFrame(residuals)
    residuals.columns = ['residuals']

    return predictions, residuals, ms_error
Example #7
def svr(input_X,
        input_Y,
        nb_folds=10,
        C=1.0,
        kernel='rbf',
        degree=3,
        gamma='auto'):
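    # Support vector regression (svm.SVR is a regressor despite the variable name
    # below); C, kernel, degree and gamma are passed straight through.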

    classification = svm.SVR(C=C, kernel=kernel, degree=degree, gamma=gamma)

    predictions = cross_validate(input_X,
                                 input_Y,
                                 classification,
                                 nb_folds=nb_folds)

    residuals = predictions - input_Y
    ms_error = np.mean(residuals**2)

    residuals = pd.DataFrame(residuals)
    residuals.columns = ['residuals']

    return predictions, residuals, ms_error
Example #8
def svc(categorical_input_X,
        class_input_Y,
        nb_folds=10,
        C=1.0,
        kernel='rbf',
        degree=3,
        gamma='auto'):
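    # Support vector classification: cross-validated predictions are compared with
    # the true classes to build a confusion matrix and a misclassification rate.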

    classification = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma)

    predictions = cross_validate(categorical_input_X,
                                 class_input_Y,
                                 classification,
                                 nb_folds=nb_folds)

    conf_matrix = confusion_matrix(class_input_Y, predictions)
    errors = (predictions - class_input_Y)

    errors = np.array([1 if error != 0 else error for error in errors])
    error_rate = float(np.sum(errors**2)) / len(class_input_Y)
    print('Error rate of misclassification: {}'.format(error_rate))

    return classification, conf_matrix
Example #9
def k_nearest_neighbors(input_X,
                        input_Y,
                        nb_folds=10,
                        n_neighbors=15,
                        weights="uniform",
                        algo="auto"):

    k_nearest = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                              weights=weights,
                                              algorithm=algo)
    predictions = cross_validate(input_X,
                                 input_Y,
                                 k_nearest,
                                 nb_folds=nb_folds)
    residuals = predictions - input_Y
    ms_error = np.mean(residuals**2)

    residuals = pd.DataFrame(residuals)
    residuals.columns = ['residuals']

    _log_res = np.log(predictions + 1) - np.log(input_Y + 1)
    rmsle = np.sqrt(np.mean(_log_res**2))

    return predictions, residuals, ms_error, rmsle
Example #10
                        for s in STRATEGIES}
        for num_neighbors in NEIGHBORHOOD_SIZES
    }
    for sim in SIMILARITIES
}

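# For every user, run cross_validate once per (similarity metric, neighborhood size,
# strategy) combination; all_scores (initialized in the truncated nested dicts above)
# collects per-user score tuples whose entries are averaged as RMSE and F1 below.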
for user in tqdm(users):
    for sim in SIMILARITIES:
        for num_neighbors in NEIGHBORHOOD_SIZES:
            for approach in STRATEGIES:
                all_scores[sim][num_neighbors][approach].append(
                    cross_validate(data=ratings,
                                   demographics=demographics,
                                   user_id=user,
                                   num_vals=CROSS_VALIDATION_SIZE,
                                   num_neighbors=num_neighbors,
                                   strategy=approach,
                                   demographic_factor=DEMOGRAPHIC_FACTOR,
                                   sim_metric=sim,
                                   personalities=personalities,
                                   personality_factor=1))

# Compute mean scores
for sim in SIMILARITIES:
    for num_neighbors in NEIGHBORHOOD_SIZES:
        for approach in STRATEGIES:
            scores = all_scores[sim][num_neighbors][approach]
            mean_rmse = np.nanmean([s[0] for s in scores])
            mean_f1 = np.nanmean([s[1] for s in scores])
            mean_scores[sim][num_neighbors][approach] = (mean_rmse, mean_f1)

save_to_disk(os.path.join('grid_search', 'grid_search_all_scores.pkl'),
Example #11
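    # Grid search over window, depth and period: each combination is scored with
    # utils.cross_validate and the setting with the lowest average error is kept.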
    all_labels = set()

    for f in args.model:
        label = ".".join(os.path.split(f)[-1].split(".")[:-1])
        all_labels.add(label)

        for seq in utils.read_fasta(f):
            seqs.append(seq)
            labels[seq["seqid"]] = label

    best_args = [12, 7, 1, 100]

    print "width\tdepth\tperiod\terror_rate"
    for w in get_range(args.window):
        for d in get_range(args.depth):
            if w < d:
                continue

            for p in get_range(args.period):
                errors = utils.cross_validate(seqs, labels, int(args.k), window=w, depth=d, period=p)
                avg_error = sum(errors) / len(errors)

                print "%s\t%s\t%s\t%0.3f" % (w, d, p, avg_error)
                sys.stdout.flush()
                if avg_error < best_args[-1]:
                    best_args = [w, d, p, avg_error]

    print "Best paramters: w=%s, d=%s, p=%s with error of %0.3f" % (best_args[0], best_args[1], best_args[2], best_args[3])


Example #12
# Dictionary that stores a list of tuples of RMSE and F1-Scores for each demographic factor
all_scores = {d: {s: [] for s in STRATEGIES} for d in DEMOGRAPHIC_FACTORS}

# Stores the computed metric tuple means (rmse, f1) for each demographic factor
mean_scores = {d: {s: None for s in STRATEGIES} for d in DEMOGRAPHIC_FACTORS}

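# For each user, evaluate every (strategy, demographic factor) pair with
# cross_validate; the resulting (rmse, f1) tuples are averaged per combination below.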
for user in tqdm(users):
    for approach in STRATEGIES:
        for d in DEMOGRAPHIC_FACTORS:
            all_scores[d][approach].append(
                cross_validate(data=ratings,
                               demographics=demographics,
                               user_id=user,
                               num_vals=CROSS_VALIDATION_SIZE,
                               num_neighbors=NEIGHBORHOOD_SIZE,
                               strategy=approach,
                               demographic_factor=d,
                               sim_metric=SIMILARITIES[approach],
                               personalities=personalities,
                               personality_factor=PERSONALITY_FACTOR))

# Compute mean scores
for d in DEMOGRAPHIC_FACTORS:
    for approach in STRATEGIES:
        scores = all_scores[d][approach]
        mean_rmse = np.nanmean([s[0] for s in scores])
        mean_f1 = np.nanmean([s[1] for s in scores])
        mean_scores[d][approach] = (mean_rmse, mean_f1)

save_to_disk(
    os.path.join('demographic_factor', 'demographic_factor_all_scores.pkl'),
Example #13
        feats = feats[max(feats.keys())]
    result_folder = os.path.join(cur_path, 'modelling', 'winner', 'final',
                                 'results')
    with open(os.path.join(result_folder, '%s.json' % (mod_name)), 'r') as fp:
        res = json.load(fp)
        res = res[max(res.keys())]
    model_weights[0][mod_name] = res
    feat_selector = FeatureSelector(feats)
    scale_folder = os.path.join(cur_path, 'modelling', 'winner', 'final',
                                'scalers', mod_name)
    scale_path = os.path.join(scale_folder,
                              os.listdir(scale_folder)[0])
    scale = load(scale_path)
    mod_preds = cross_validate(X[feats],
                               Y,
                               model,
                               scale,
                               only_scores=False,
                               njobs=1)
    mod_preds.rename(columns={0: mod_name}, inplace=True)
    pred_df = pred_df.join(mod_preds)
#    pipe = Pipeline([('feature_selection', feat_selector), ('scaler', scale), ('clf', model)])

pred_cols = [i for i in list(pred_df) if i != 'winner']

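# Score each model's cross-validated predictions row by row against the true
# 'winner' column using log loss.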
mod_scores = {}
for idx in pred_df.index:
    mod_scores[idx] = {}
    row = pred_df.loc[idx]
    for mod in pred_cols:
        row_score = logloss(row['winner'], row[mod])
        mod_scores[idx][mod] = row_score
Example #14
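# Logistic-regression version of the loan workflow (lr() presumably wraps sklearn's
# LogisticRegression): raw vs. scaled fits, a refit on the top six coefficients,
# an ROC plot, parameter exploration, and the run_opt feature search.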
def main():
    "main program"
    app = get_app_title()
    appf = get_app_file()
    plotdir = make_plotdir()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    
    # skip scaling for now, score 0.71
    loans_X = loans_df
    test_X = test_df
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "rawvar", indep_vars, test_df, test_y, pred_y)

    # add scaling, score 0.90    
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    print("columns:", indep_vars)
#   print_coefs(clf)
    X_labels = list(loans_df.columns)
#   print_lr_coefs(clf, X_labels)
    plist = print_lr_coefs(clf, indep_vars)

# find score using only top6
    top6 = [p[0] for p in plist[:6]]
    print("top6:", top6)
    loans_X = loans_df[top6]
    test_X = test_df[top6]
    loans_X, my_scaler = scale_train_data(loans_X, print_out=True)
    test_X = scale_test_data(my_scaler, test_X)
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    print_lr_coefs(clf, top6)
    plot_predict(plotdir, app, appf, "top6", top6, test_df, test_y, pred_y)

    do_roc(clf, test_X, test_y, "top6", top6, app, appf, plotdir)
    
#    arr = clf.decision_function(loans_df)
#    print("decision function:", arr.shape, arr)  # shape (1873,)
##    clf.decision_function(loans_df)
#    print_coefs(clf)
# traditional coefs in "frequentist" style?
#    proba = clf.predict_proba(loans_X)
#    print("proba", proba.shape, proba)
    
    explore_params(loans_X, loans_y, plotdir, app, appf)
    
    # run optimization routine
    clf = lr()
#    init_list = [indep_vars[0], indep_vars[1]]
#    random_opt(clf, indep_vars, init_list, loans_df, loans_y, print_out=True)
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir, rescale=True)
    # accuracy 73% +- 3% with no scaling  (90% with scaling)
#    print_coefs(clf)

    # redo exploration with optimized columns
    loans_X = loans_df[opt_list]
    test_X = test_df[opt_list]
    loans_X, my_scaler = scale_train_data(loans_X, print_out=True)
    test_X = scale_test_data(my_scaler, test_X)
#    print("loans_X head\n", loans_X[:3])
    explore_params(loans_X, loans_y, plotdir, app, appf+"opt_")
    # accuracy 73% due to no scaling
    
    clf = lr()
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = lr()
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)
    print("opt_list columns:", opt_list)
#   print_coefs(clf)
#   print_lr_coefs(clf, X_labels)
    print_lr_coefs(clf, opt_list)
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
Example #15
users = ratings.UserId.unique()

# Dictionary that stores a list of tuples of RMSE and F1-Scores for each personality factor
all_scores = {p: [] for p in PERSONALITY_FACTORS}

# Stores the computed metric tuple means (rmse, f1) for each personality factor
mean_scores = {p: None for p in PERSONALITY_FACTORS}

for user in tqdm(users):
    for p in PERSONALITY_FACTORS:
        all_scores[p].append(
            cross_validate(data=ratings,
                           demographics=demographics,
                           user_id=user,
                           num_vals=CROSS_VALIDATION_SIZE,
                           num_neighbors=NEIGHBORHOOD_SIZE,
                           strategy=STRATEGY,
                           demographic_factor=DEMOGRAPHIC_FACTOR,
                           sim_metric=SIMILARITIY,
                           personalities=personalities,
                           personality_factor=p))

# Compute mean scores
for p in PERSONALITY_FACTORS:
    mean_rmse = np.nanmean([s[0] for s in all_scores[p]])
    mean_f1 = np.nanmean([s[1] for s in all_scores[p]])
    mean_scores[p] = (mean_rmse, mean_f1)

save_to_disk(
    os.path.join('personality_factor', 'personality_factor_all_scores.pkl'),
    all_scores)
save_to_disk(
Example #16
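# Command-line driver: read labeled sequences from the sample files given on the
# command line, shuffle them, and report per-fold and average cross-validation error.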
#!/usr/bin/python

import utils
import os
import sys
import random
import subprocess

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "USAGE: cross_validate.py <n-fold> <class1_samples> <class2_samples>..."
        sys.exit(1)

    seqs, labels, all_labels = utils.read_all_labeled(sys.argv[2:])
    folds = int(sys.argv[1])
    
    random.shuffle(seqs)
    errors = utils.cross_validate(seqs, labels, folds)

    print "Fold errors:", " ".join([str(x) for x in errors])
    print "Average error: %3.2f%%" % (sum(errors) * 100.0 / len(errors))