def get_results_protein():
    """Cross-validate a ridge regression on the protein dataset.

    Loads train/test splits, normalizes them, and runs the project's
    cross-validation routine with Ridge(alpha=0.01).
    """
    train, test = read_data.get_data("Grupa5_data/protein.RData")

    # Columns 0..1999 are features; column 2000 is the target.
    x_train = read_data.normalize_data(pd.DataFrame(train.iloc[:, 0:2000]))
    # NOTE(review): x_test is normalized but not used below — kept for parity
    # with the original flow; confirm whether prediction is done elsewhere.
    x_test = read_data.normalize_data(pd.DataFrame(test))
    y_train = train.iloc[:, 2000]

    estimator = Ridge(alpha=0.01)
    cross_validation.cross_validate(x_train, y_train, estimator)
def crossval(k, configspath, gpu, dataset, savefolder, num_epochs, verbose):
    """Cross validate the data and return the best network configuration.

    Args:
        k: number of random configurations to generate when no config file
            is supplied.
        configspath: optional path to a whitespace-separated config file,
            one ``int int float`` triple per line.
        gpu: forwarded to ``cross_validation.cross_validate``.
        dataset: dataset object; pickled into ``savefolder`` when given.
        savefolder: optional output directory (created if missing).
        num_epochs: training epochs per fold.
        verbose: print progress information when truthy.

    Returns:
        The best configuration found by ``cross_validation.cross_validate``.
    """
    # Fixes vs. original: removed the dead (typo'd) `datefolder = None`
    # assignment, closed the config file via a context manager, and replaced
    # the fragile `line[2][:-1]` slice — which chopped the final digit when
    # the last line had no trailing newline — with float(), which tolerates
    # surrounding whitespace.
    if savefolder:
        if not os.path.exists(savefolder):
            os.makedirs(savefolder)
        datasetpath = os.path.join(savefolder, 'dataset')
        with open(datasetpath, 'wb') as datasetfile:
            cPickle.dump(dataset, datasetfile, cPickle.HIGHEST_PROTOCOL)

    if not configspath:
        configs = network_configs.random_network_configs(k)
    else:
        configs = []
        with open(configspath) as configsfile:
            for line in configsfile:
                fields = line.split(' ')
                configs.append(
                    (int(fields[0]), int(fields[1]), float(fields[2]))
                )

    if verbose:
        print(configs)
        print('Dataset read')
        print('Cross validation with %d folds' % len(configs))

    best_config = cross_validation.cross_validate(
        configs, dataset, savefolder=savefolder,
        num_epochs=num_epochs, verbose=verbose, gpu=gpu)
    return best_config
def test_cv():
    """Exercise cross_validate on two noisy linear signals.

    Both lambda grids should select their middle entry (index 1),
    regardless of the verbose / save_x_hats flags.
    """
    n = 100
    x = np.array([1, 1])
    a1, a2 = 10, 25

    y = np.empty((2, n))
    np.random.seed(42)  # seed before drawing so results are reproducible
    y[0] = a1 * x[0] + 3 * np.random.randn(n)
    y[1] = a2 * x[0] + 5 * np.random.randn(n)

    splitter = ds.KFold(n, 5, randomize=True)
    params = (a1, a2)
    lamdas_list = [np.array([0.3, 3, 30]), np.array([0.5, 5, 50])]

    # Sweep every combination of the two boolean flags.
    for verbose, save_x_hats in product([True, False], [True, False]):
        ret = cross_validate(y, splitter, mle, params, lamdas_list,
                             l2_error, params, verbose, save_x_hats)
        if save_x_hats:
            lamda_stars, lamda_star_indices, error, mean_error, x_hats = ret
        else:
            lamda_stars, lamda_star_indices, error, mean_error = ret
        print(mean_error)
        assert lamda_star_indices[0] == 1
        assert lamda_star_indices[1] == 1
def get_results_cancer():
    """Cross-validate a default ridge regression on the cancer dataset.

    The raw data stores samples as columns: rows 0..17736 hold features and
    row 17737 the target, so everything is transposed into samples-as-rows
    before normalization.
    """
    train, test = read_data.get_data("Grupa5_data/cancer.RData")

    x_train = pd.DataFrame(train.iloc[0:17737, :]).T
    y_train = train.iloc[17737, :].T
    x_test = pd.DataFrame(test).T

    # Debug output, kept from the original flow.
    print(x_train)
    print(x_test)

    x_train = read_data.normalize_data(x_train)
    x_test = read_data.normalize_data(x_test)

    cross_validation.cross_validate(x_train, y_train, Ridge())
def process_lambda_grid_search(id, y, X, fold_count, seed, gd_func, max_iters,
                               gamma, lamb, degree):
    """Cross-validate one (gamma, lambda) grid point and pickle the results.

    Runs k-fold cross-validation with all-ones initial weights, then dumps
    the per-fold weights and train/test accuracy ratios to a file named
    after the hyper-parameters.
    """
    _, num_features = X.shape
    initial_w = np.ones(num_features)

    # k-fold cross-validation
    w_stars, train_correct_ratios, test_correct_ratios = \
        cross_validation.cross_validate(id, y, X, fold_count, seed, gd_func,
                                        initial_w, max_iters, gamma, lamb)

    filename = ("train_clean_avg_L2_degree{degree}_fold{fold}_gamma{gamma}"
                "_iter{iter}_lamb{lamb}.pickle").format(
                    degree=degree, fold=fold_count, gamma=gamma,
                    iter=max_iters, lamb=lamb)
    with open(filename, "wb") as pickle_file:
        pickle.dump((w_stars, train_correct_ratios, test_correct_ratios),
                    pickle_file)
def main(): # when the attributes have different data range heterogeneous_data = mock_Chinese_stock_price.get_stockset_various() # in this dataset, I have added investment, and employee number, # they all have large numbers and will influence the results significantly without normalization, # then those more important attributes with smaller values may not influence the result and the final result cannot be accurate print "before re-scale/normalization" cv_total_error_unweighted = cross_validation.cross_validate(heterogeneous_data, algr=KNN.get_KNN, trails=100) cv_total_error_weighted = cross_validation.cross_validate(heterogeneous_data, algr=KNN.get_weightedKNN, trails=100) print "cross validation, using un-weighted KNN: ", cv_total_error_unweighted print "cross validation, using weighted KNN: ", cv_total_error_weighted print "after re-scale" scale = [10, 10, 10, 0.00001, 0] scaled_data = rescale(heterogeneous_data, scale) scaled_cv_total_error_unweighted = cross_validation.cross_validate(scaled_data, algr=KNN.get_KNN, trails=100) scaled_cv_total_error_weighted = cross_validation.cross_validate(scaled_data, algr=KNN.get_weightedKNN, trails=100) print "cross validation, using un-weighted KNN: ", scaled_cv_total_error_unweighted print "cross validation, using weighted KNN: ", scaled_cv_total_error_weighted print "after normalization" min_max = [(1, 10), (1, 20), (1, 50), (10000, 10000000)] normalized_data = normalization(heterogeneous_data, min_max) normalized_cv_total_error_unweighted = cross_validation.cross_validate( normalized_data, algr=KNN.get_KNN, trails=100 ) normalized_cv_total_error_weighted = cross_validation.cross_validate( normalized_data, algr=KNN.get_weightedKNN, trails=100 ) print "cross validation, using un-weighted KNN: ", normalized_cv_total_error_unweighted print "cross validation, using weighted KNN: ", normalized_cv_total_error_weighted
def main(): # when the attributes have different data range heterogeneous_data = mock_Chinese_stock_price.get_stockset_various() # in this dataset, I have added investment, and employee number, # they all have large numbers and will influence the results significantly without normalization, # then those more important attributes with smaller values may not influence the result and the final result cannot be accurate print 'before re-scale/normalization' cv_total_error_unweighted = cross_validation.cross_validate(heterogeneous_data, algr = KNN.get_KNN, trails=100) cv_total_error_weighted = cross_validation.cross_validate(heterogeneous_data, algr = KNN.get_weightedKNN, trails=100) print 'cross validation, using un-weighted KNN: ', cv_total_error_unweighted print 'cross validation, using weighted KNN: ', cv_total_error_weighted print 'after re-scale' scale = [10, 10, 10, 0.00001, 0] scaled_data = rescale(heterogeneous_data, scale) scaled_cv_total_error_unweighted = cross_validation.cross_validate(scaled_data, algr = KNN.get_KNN, trails=100) scaled_cv_total_error_weighted = cross_validation.cross_validate(scaled_data, algr = KNN.get_weightedKNN, trails=100) print 'cross validation, using un-weighted KNN: ', scaled_cv_total_error_unweighted print 'cross validation, using weighted KNN: ', scaled_cv_total_error_weighted print 'after normalization' min_max = [(1,10), (1,20), (1,50), (10000, 10000000)] normalized_data = normalization(heterogeneous_data, min_max) normalized_cv_total_error_unweighted = cross_validation.cross_validate(normalized_data, algr = KNN.get_KNN, trails=100) normalized_cv_total_error_weighted = cross_validation.cross_validate(normalized_data, algr = KNN.get_weightedKNN, trails=100) print 'cross validation, using un-weighted KNN: ', normalized_cv_total_error_unweighted print 'cross validation, using weighted KNN: ', normalized_cv_total_error_weighted
def estimate_meta(directories, trainer, range_values, label, static_features):
    """Sweep one feature's parameter values and plot cross-validated error.

    For each (value, feature) pair the feature is combined with the static
    features, evaluated with 10-fold cross-validation, and reset on every
    image before the next value is tried. The resulting (value, error) pairs
    are saved as a scatter plot named after the calling function.
    """
    images = load(directories, True, permute=True)

    results = []
    for value, feature in range_values:
        print("Optimizing %s: %f" % (feature.key(), value))
        error_rate = cross_validate(images, [feature] + static_features,
                                    trainer, k=10, verbose=False)
        print("Error rate: %f" % error_rate)
        results.append([value, error_rate])
        # Clear cached feature state so the next parameter value starts fresh.
        for image in images:
            image.reset(feature)

    plot = ScatterPlot(ylabel='error rate', xlabel='param')
    # Use the caller's function name as the output file name.
    caller_name = inspect.stack()[1][3]
    plot.save([label], [results], "result_graphs/" + caller_name)
# Two more dense baselines with the same architecture as the first model
# (180 inputs -> 3x80 sigmoid -> 1 linear) so their cross-validated MSEs
# are directly comparable.
model2 = Sequential()
model2.add(Dense(input_shape=(180,), units=80, activation=sigmoid))
model2.add(Dense(80, activation=sigmoid))
model2.add(Dense(80, activation=sigmoid))
model2.add(Dense(1, activation=linear))
model2.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

model3 = Sequential()
model3.add(Dense(input_shape=(180,), units=80, activation=sigmoid))
model3.add(Dense(80, activation=sigmoid))
model3.add(Dense(80, activation=sigmoid))
model3.add(Dense(1, activation=linear))
model3.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

# NOTE(review): `model` is defined earlier in the file — assumed to be the
# first dense baseline; confirm it mirrors model2/model3.
mses_model1 = cross_validate(model, x_train, y_train, epochs=20, verbose=0)
mses_model2 = cross_validate(model2, x_train, y_train, epochs=20, verbose=0)
mses_model3 = cross_validate(model3, x_train, y_train, epochs=20, verbose=0)

# RNN
# Re-extract the data in its sequential form for the recurrent models.
featu_enc, x, y = get_data(data, dynamic=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# Embedding of 21 symbols into 50 dims over length-18 sequences, then LSTM.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=21, output_dim=50, input_length=18))
rnn_model.add(LSTM(100, return_sequences=False))
rnn_model.add(Dense(1, activation=linear))
rnn_model.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

rnn_model2 = Sequential()
rnn_model2.add(Embedding(input_dim=21, output_dim=50, input_length=18))
# print(x_train) # print(x_train) # print(y_train) # rf = RandomForestRegressor(n_estimators=100) # cross_validation.cross_validate(x_train, y_train, rf) # # rf = RandomForestRegressor(n_estimators=100) # features = feature_engineering.recursive_feature_engineering(rf, x_train, y_train) # # rf = RandomForestRegressor(n_estimators=100) # cross_validation.cross_validate(x_train[features], y_train, rf) # rf.fit(x_train, y_train) # lr = LogisticRegression() # rf = RandomForestRegressor() # dt = DecisionTreeRegressor() xgb = XGBRegressor() # sv = svm.SVR() # model = sv model = xgb model.fit(x_train, y_train) cross_validation.cross_validate(x_train, y_train, model) f_imp = model.feature_importances_ cut_off = np.quantile(f_imp, q=(1-0.0025)) feature_list = np.where(f_imp >= cut_off) features = feature_list # features = feature_engineering.svm_feature_engineering(x_train, y_train) print(features)
""" Launch a cross-validation on three models to see which is best on wine data. Model to test: KNeighborsClassifier, DecisionTreeClassifier and MLPClassifier. Author: Claudio Sousa, David Gonzalez """ from sklearn import datasets from cross_validation import cross_validate, plot_validation, output_csv, normalise_data from models import instanciate_kneighbors_model, instanciate_decisiontree_model, instanciate_mlp_model data = datasets.load_wine() data.data = normalise_data(data.data) models = [ instanciate_kneighbors_model(1, 11), instanciate_decisiontree_model(1, 11), instanciate_mlp_model() ] best_model = cross_validate(data, models, 5, 10) output_csv(models, best_model, "wine") plot_validation(models, best_model)
def forward_stepwise_selection(
    func_builder: FuncBuilder,
    data: pd.DataFrame,
    fs: List[str],
    e: str,
    seg_col: str,
    K: int,
) -> (List[str], Dict[int, float]):
    """Greedy forward feature selection scored by K-fold cross-validated r2.

    First grows a nested sequence of feature sets, at each step adding the
    single feature whose in-sample r2 is highest. Then cross-validates every
    step and returns the best-scoring feature set together with a mapping of
    step index -> cross-validated r2.

    Note: `fs` is consumed (features are removed as they are selected).
    """
    selected_steps = [[]]  # selected_steps[i] holds the i best features

    # Phase 1: greedy growth using in-sample r2.
    while fs:
        top_score = 0
        top_feature = ""
        untried = True
        for candidate in fs:
            trial_features = selected_steps[-1] + [candidate]
            model = func_builder(trial_features, e, seg_col)
            model.fit(data)
            candidate_score, _ = model.score(data, "r2")
            if untried or candidate_score > top_score:
                top_score = candidate_score
                top_feature = candidate
                untried = False
        selected_steps.append(selected_steps[-1] + [top_feature])
        fs.remove(top_feature)

    # Phase 2: pick the step with the best cross-validated r2.
    scores_by_step = {}
    best_step = 0
    top_score = 0
    untried = True
    for step, feature_set in enumerate(selected_steps):
        model = func_builder(feature_set, e, seg_col)
        r2_scores, _ = cv.cross_validate(model, data, ["r2"], K)
        step_score = r2_scores["r2"]
        scores_by_step[step] = step_score
        if untried or step_score > top_score:
            top_score = step_score
            best_step = step
            untried = False

    return selected_steps[best_step], scores_by_step
def costf(scale):
    """Objective for the scale search: cross-validated error on rescaled data.

    `data`, `algr` and `trails` are presumably captured from the enclosing
    scope — confirm against the surrounding code.
    """
    return cross_validation.cross_validate(rescale(data, scale), algr, trails)
import json

# Final train/test errors collected across all models.
dump_final = []
if(cross_val):
    dump = []  # per-model cross-validation curves, only when cross-validating

for i in range(len(model_list)):
    model = model_list[i]
    # Each model has its own optimizer factory, learning rate, scheduler
    # decay and loss criterion, indexed in lockstep with model_list.
    optimizer = optimizer_list[i](model.parameters(),lr=lr_list[i])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,scheduler_gamma_list[i])
    criterion = criterion_list[i]
    print("Model {} of {} : {}".format(i+1,len(model_list),model.name()))
    if(cross_val):
        k_fold = 4
        print("Cross validating... on {} folds".format(k_fold))
        tr_loss, val_loss, tr_err, val_err = cv.cross_validate(model, criterion,optimizer,scheduler,train_input,train_target,k_fold,batch_size=10,n_epochs=epochs_list[i],n_augmentation=0,verbose=2)
        print("Mean train error : {}, mean validation error : {}".format(tr_err[-1],val_err[-1]))
        dump.append((model.name(), " train ", tr_loss, tr_err))
        dump.append((model.name(), " validation ", val_loss, val_err))
        # Re-initialize weights so the final training run starts from scratch.
        model.reset()
    print("Training...")
    cv.train_model(model,criterion,optimizer,scheduler,train_input,train_target,n_epochs=epochs_list[i],batch_size=10,n_augmentation=0,verbose=2)
    final_tr_error = cv.evaluate_error(model,train_input,train_target)
    final_te_error = cv.evaluate_error(model,test_input,test_target)
    print("Train error = {} ; Test error = {} ".format(final_tr_error,final_te_error))
    dump_final.append((model.name(), " train ", final_tr_error.item()))
    dump_final.append((model.name(), " test " , final_te_error.item()))
shuffle(all_settings) # Save results in a dict, mapping settings to results grid_filename = 'gridsearch2' # If the grid search was started previously, load the results # Otherwise, start from an empty dict if os.path.exists(filepath(grid_filename)): with open(filepath(grid_filename), 'rb') as f: results = pickle.load(f) else: results = {} # For each setting that hasn't already been tried, # apply cross-validation, and save the scores for pre_set, train_set in all_settings: if pre_set + train_set in results: continue print(pre_set, train_set) scores = cross_validate(messages, gold, folds, get_vectoriser, preproc_args=pre_set, train_kwargs=train_kwargs(*train_set)) print(scores) results[pre_set + train_set] = scores with open(filepath(grid_filename), 'wb') as f: pickle.dump(results, f)