def mlr(train_features, train_labels, test_features, names, dummies, net_type,
        extra_features=False):
    """
    Train the Multiple Linear Regressor from training data with labels and
    perform predictions on the test data.

    Parameters:
      train_features -- training features (presumably a pandas DataFrame;
                        it is indexed with `names` / `dummies`).
      train_labels   -- training labels; only the first column of
                        `train_labels.to_numpy()` is used as target.
      test_features  -- features to predict on; its `.empty` attribute
                        decides whether predictions are made at all.
      names          -- columns that are scaled when `extra_features` is set.
      dummies        -- columns appended unscaled when `extra_features` is
                        set.
      net_type       -- label used in the log line and forwarded to
                        `ml_funcs.apply_scaling`.
      extra_features -- when True, scale only `names` and concatenate the
                        `dummies` columns as they are.

    Returns:
      The predictions for `test_features`, or None when `test_features` is
      empty.
    """

    print('\n=== Running Multiple Linear Regression for {0} ==='.format(
        net_type))

    regressor = LinearRegression(n_jobs=-1)

    if extra_features:
        # Scale only the `names` columns; the `dummies` columns are appended
        # to the scaled block unchanged.
        train_scaled_tmp, scaler = ml_funcs.apply_scaling(
            train_features[names], 'MLR', net_type, save_scaler=False)
        train_scaled = np.concatenate(
            [train_scaled_tmp, np.array(train_features[dummies])], axis=1)
    else:
        train_scaled, scaler = ml_funcs.apply_scaling(
            train_features, 'MLR', net_type, save_scaler=False)

    # Fit model to the data.
    print('>> Training the network <<')
    starttime = time()
    regressor.fit(train_scaled, train_labels.to_numpy().T[0])
    duration = time() - starttime
    print("Time: ", round(duration, 2), "s")

    # Only perform predictions when there are test features; make the
    # "nothing to predict" outcome explicit instead of leaving the return
    # value undefined.
    if test_features.empty:
        return None

    # Scale the test features with the scaler fitted on the training data.
    if extra_features:
        test_scaled_tmp = scaler.transform(test_features[names])
        test_scaled = np.concatenate(
            [test_scaled_tmp, np.array(test_features[dummies])], axis=1)
    else:
        test_scaled = scaler.transform(test_features)

    print('>> Perform predictions <<')
    starttime = time()
    predictions = regressor.predict(test_scaled)
    duration = time() - starttime
    print("Time: ", round(duration, 2), "s")

    return predictions
def cv_rf(train_features, train_labels):
    """
    Apply cross validation for the Random Forest Regressor to find its
    optimal hyperparameters based on the training data.

    A randomized search samples 75 parameter combinations and evaluates each
    with 5-fold cross validation, scored by negative mean absolute error.
    The best estimator and its parameters are printed; nothing is returned.

    Parameters:
      train_features -- training features, scaled through
                        `ml_funcs.apply_scaling` before the search.
      train_labels   -- training labels passed to the search unchanged.
    """

    # The number of trees in the random forest.
    n_estimators = np.linspace(start=50, stop=600, num=12, dtype=int)

    # The number of features to consider at every split of a node.
    # 1.0 means "all features", which is what 'auto' meant for a regressor;
    # 'auto' itself is no longer accepted by recent scikit-learn releases.
    max_features = [1.0, 'sqrt', 'log2']

    # The maximum depth of the trees; None leaves the depth unbounded.
    max_depth = [int(x) for x in np.linspace(2, 20, num=10, dtype=int)]
    max_depth.append(None)

    # The minimum number of samples required to split a node.
    min_samples_split = np.linspace(5, 50, num=10, dtype=int)

    # The minimum number of samples required at each leaf node.
    min_samples_leaf = np.linspace(5, 50, num=10, dtype=int)

    # The method for selecting the samples for each individual tree.
    bootstrap = [True, False]

    # Create a random grid with all parameters.
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }

    # Base model to tune.
    regressor = RandomForestRegressor(random_state=0)

    # Random search of parameters: 75 sampled combinations, each evaluated
    # with 5-fold cross validation, with n_jobs=-1 for parallel execution.
    rf_random = RandomizedSearchCV(estimator=regressor,
                                   param_distributions=random_grid,
                                   scoring='neg_mean_absolute_error',
                                   n_iter=75, cv=5, verbose=2,
                                   random_state=0, n_jobs=-1)

    # Scale the features.
    train_scaled, _ = ml_funcs.apply_scaling(train_features, 'RF', 'None',
                                             save_scaler=False)

    # Fit the random search model.
    search = rf_random.fit(train_scaled, train_labels)

    # Report the parameters that had the best outcome.
    print("RFR best estimator:")
    print(search.best_estimator_)

    print("RFR best hyperparameters of best estimator:")
    print(search.best_estimator_.get_params())

    print("RFR best hyperparameters of search obj:")
    print(search.best_params_)
def test_performance(train_features, train_labels, test_features, test_labels,
                     method, env):
    """
    Test how much impact the hyperparameter tuning had by comparing the
    optimised model to a bare model.

    Both models are fitted on the same scaled training data and evaluated on
    the same scaled test data. All models are seeded with random_state=0 so
    that the comparison is reproducible between runs.

    Parameters:
      train_features, train_labels -- data the models are fitted on.
      test_features, test_labels   -- data the models are evaluated on.
      method -- 'RFR' (random forest) or 'SVR' (linear support vector
                regressor).
      env    -- 'CBD', 'suburbs' or 'combined'; selects the tuned
                hyperparameter set.

    Returns:
      (accuracy_bare, bare_mae, accuracy_optimized, optimized_mae), or None
      (after printing a message) when `method` or `env` is not recognised.
    """

    if method == 'RFR':
        bare = RandomForestRegressor(random_state=0, n_jobs=-1)

        if env == 'CBD':
            optimized = RandomForestRegressor(n_estimators=450,
                                              min_samples_split=50,
                                              min_samples_leaf=15,
                                              max_features='sqrt',
                                              max_depth=14,
                                              bootstrap=False,
                                              random_state=0, n_jobs=-1)
        elif env in ('suburbs', 'combined'):
            optimized = RandomForestRegressor(n_estimators=100,
                                              min_samples_split=20,
                                              min_samples_leaf=5,
                                              max_features='sqrt',
                                              max_depth=None,
                                              bootstrap=True,
                                              random_state=0, n_jobs=-1)
        else:
            print("Not a valid environment!")
            return None

    elif method == 'SVR':
        bare = LinearSVR(random_state=0)

        if env == 'CBD':
            optimized = LinearSVR(random_state=0, tol=1e-4, max_iter=1800,
                                  loss='squared_epsilon_insensitive',
                                  epsilon=1.0, dual=True, C=1e-3)
        elif env == 'suburbs':
            optimized = LinearSVR(random_state=0, tol=1e-5, max_iter=5000,
                                  loss='squared_epsilon_insensitive',
                                  epsilon=0.0, dual=False, C=1e-4)
        elif env == 'combined':
            optimized = LinearSVR(random_state=0, tol=0.0001, max_iter=200,
                                  loss='epsilon_insensitive', epsilon=1.0,
                                  C=0.01, dual=True)
        else:
            print("Not a valid environment!")
            return None

    else:
        print("Not a valid method: choose RFR or SVR.")
        return None

    # Scale the features; the test set reuses the scaler fitted on the
    # training set.
    train_scaled, scaler = ml_funcs.apply_scaling(train_features, method, env,
                                                  save_scaler=False)
    test_scaled = scaler.transform(test_features)

    # Fit the data on the bare model, perform height predictions.
    bare.fit(train_scaled, train_labels)
    predictions_bare = bare.predict(test_scaled)
    accuracy_bare = compute_accuracy(predictions_bare, test_labels)
    # mean_absolute_error takes (y_true, y_pred).
    bare_mae = mean_absolute_error(test_labels, predictions_bare)

    # Now do the same for the optimized model.
    optimized.fit(train_scaled, train_labels)
    predictions_optimized = optimized.predict(test_scaled)
    accuracy_optimized = compute_accuracy(predictions_optimized, test_labels)
    optimized_mae = mean_absolute_error(test_labels, predictions_optimized)

    return accuracy_bare, bare_mae, accuracy_optimized, optimized_mae
def rf_min_samples_leaf(train_features, train_labels, test_features,
                        test_labels, name):
    """
    Plot the minimum samples required in a leaf against the accuracy.

    One random forest is fitted per candidate `min_samples_leaf` value; the
    value returned by `compute_accuracy` is recorded for the train and the
    test predictions and both curves are drawn in one figure. The figure is
    saved as ./Figures/Min_Samples_Leaf_<name>.pdf when
    `generate_plots.directory_exists` reports the directory, otherwise a
    message is printed.
    """

    sns.set()
    sns.set_style("ticks")

    # Steps of 2 for small leaves (2..24), steps of 25 afterwards (25..750).
    leaf_sizes = np.hstack((
        np.linspace(2, 24, 12, dtype=int),
        np.linspace(25, 750, num=30, dtype=int),
    ))

    scaled_train, scaler = ml_funcs.apply_scaling(train_features, 'RF', name,
                                                  save_scaler=False)
    scaled_test = scaler.transform(test_features)

    scores_train, scores_test = [], []

    for leaf_size in leaf_sizes:
        print("Samples leaf:", leaf_size)

        forest = RandomForestRegressor(min_samples_leaf=leaf_size, n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the training data first, then on the test data.
        scores_train.append(
            compute_accuracy(forest.predict(scaled_train), train_labels))
        scores_test.append(
            compute_accuracy(forest.predict(scaled_test), test_labels))

    fig = plt.figure(figsize=(10, 6))
    for scores, series in ((scores_train, 'Train'), (scores_test, 'Test')):
        sns.lineplot(x=leaf_sizes, y=scores, label=series)

    plt.legend(frameon=False, loc='upper right')
    plt.xlabel('Minimum samples in leaf')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/Min_Samples_Leaf_" + name + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
def rf_max_depth(train_features, train_labels, test_features, test_labels,
                 name):
    """
    Plot the maximum tree depth against the accuracy.

    One random forest is fitted per depth from 1 to 35; the value returned
    by `compute_accuracy` is recorded for the train and the test predictions
    and both curves are drawn in one figure. The figure is saved as
    ./Figures/Max_Depth_<name>.pdf when `generate_plots.directory_exists`
    reports the directory, otherwise a message is printed.
    """

    sns.set()
    sns.set_style("ticks")

    # Candidate maximum depths: 1, 2, ..., 35.
    depths = np.linspace(1, 35, 35, dtype=int)

    scaled_train, scaler = ml_funcs.apply_scaling(train_features, 'RF', name,
                                                  save_scaler=False)
    scaled_test = scaler.transform(test_features)

    scores_train, scores_test = [], []

    for tree_depth in depths:
        print("Depth:", tree_depth)

        forest = RandomForestRegressor(max_depth=tree_depth, n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the training data first, then on the test data.
        scores_train.append(
            compute_accuracy(forest.predict(scaled_train), train_labels))
        scores_test.append(
            compute_accuracy(forest.predict(scaled_test), test_labels))

    fig = plt.figure(figsize=(10, 6))
    for scores, series in ((scores_train, 'Train'), (scores_test, 'Test')):
        sns.lineplot(x=depths, y=scores, label=series)

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('Maximum tree depth')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/Max_Depth_" + name + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
def rf_n_estimators(train_features, train_labels, test_features, test_labels,
                    name):
    """
    Plot the number of estimators against the accuracy.

    One random forest is fitted per tree count (powers of two from 1 to
    512); the value returned by `compute_accuracy` is recorded for the train
    and the test predictions and both curves are drawn in one figure. The
    figure is saved as ./Figures/N_Estimators_<name>.pdf when
    `generate_plots.directory_exists` reports the directory, otherwise a
    message is printed.
    """

    sns.set()
    sns.set_style("ticks")

    # The number of trees in the random forest: 1, 2, 4, ..., 512.
    tree_counts = [2 ** exponent for exponent in range(10)]

    scaled_train, scaler = ml_funcs.apply_scaling(train_features, 'RF', name,
                                                  save_scaler=False)
    scaled_test = scaler.transform(test_features)

    scores_train, scores_test = [], []

    for tree_count in tree_counts:
        print("Num estimators:", tree_count)

        forest = RandomForestRegressor(n_estimators=tree_count, n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the training data first, then on the test data.
        scores_train.append(
            compute_accuracy(forest.predict(scaled_train), train_labels))
        scores_test.append(
            compute_accuracy(forest.predict(scaled_test), test_labels))

    fig = plt.figure(figsize=(10, 6))
    for scores, series in ((scores_train, 'Train'), (scores_test, 'Test')):
        sns.lineplot(x=tree_counts, y=scores, label=series)

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('Number of estimators')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/N_Estimators_" + name + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
def svr_C(train_features, train_labels, test_features, test_labels, name):
    """
    Plot C against the accuracy.

    One LinearSVR is fitted per C value (10 evenly spaced values between
    1e-4 and 1); the value returned by `compute_accuracy` is recorded for
    the train and the test predictions and both curves are drawn in one
    figure. The figure is saved as ./Figures/C_<name>.pdf when
    `generate_plots.directory_exists` reports the directory, otherwise a
    message is printed.
    """

    sns.set()
    sns.set_style("ticks")

    penalties = np.linspace(1e-4, 1, 10)

    scaled_train, scaler = ml_funcs.apply_scaling(train_features, 'SVR', name,
                                                  save_scaler=False)
    scaled_test = scaler.transform(test_features)

    scores_train, scores_test = [], []

    for penalty in penalties:
        print("C:", penalty)

        model = LinearSVR(C=penalty, max_iter=2000, random_state=0)
        model.fit(scaled_train, train_labels)

        # Score on the training data first, then on the test data.
        scores_train.append(
            compute_accuracy(model.predict(scaled_train), train_labels))
        scores_test.append(
            compute_accuracy(model.predict(scaled_test), test_labels))

    fig = plt.figure(figsize=(10, 6))
    for scores, series in ((scores_train, 'Train'), (scores_test, 'Test')):
        sns.lineplot(x=penalties, y=scores, label=series)

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('C')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/C_" + name + ".pdf", bbox_inches="tight",
                    dpi=300, transparent=True)
def cv_svr(train_features, train_labels):
    """
    Perform the k-fold cross validation for the support vector regressor.

    A randomized search samples 75 parameter combinations for a LinearSVR
    and evaluates each with 5-fold cross validation. Fits that raise an
    error are given a score of 0.0 (error_score). The best estimator and its
    parameters are printed; nothing is returned.
    """

    # Candidate values for every tuned hyperparameter.
    search_space = {
        'epsilon': [0.0, 0.5, 1.0],
        'tol': [1e-3, 1e-4, 1e-5],
        'C': [1e-4, 1e-3, 1e-2, 0.1, 1.0],
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'dual': [True, False],
        'max_iter': np.linspace(200, 5000, 25, dtype=int),
    }

    # Base model whose hyperparameters are tuned.
    base_model = LinearSVR(random_state=0)

    # Randomized search: 75 sampled combinations, 5-fold cross validation,
    # n_jobs=-1 for parallel execution.
    randomized_search = RandomizedSearchCV(estimator=base_model,
                                           param_distributions=search_space,
                                           n_iter=75, cv=5, verbose=2,
                                           random_state=0, n_jobs=-1,
                                           error_score=0.0)

    # Scale the features; the fitted scaler itself is not needed here.
    scaled_train, _ = ml_funcs.apply_scaling(train_features, 'SVR', 'None',
                                             save_scaler=False)

    # Run the search.
    fitted_search = randomized_search.fit(scaled_train, train_labels)

    # Report the best outcome.
    print("SVR best estimator:")
    print(fitted_search.best_estimator_)

    print("SVR best hyperparameters of best estimator:")
    print(fitted_search.best_estimator_.get_params())

    print("SVR best hyperparameters of search obj:")
    print(fitted_search.best_params_)
def svr_maxiter_tolerance(train_features, train_labels, test_features,
                          test_labels, name):
    """
    Plot a combination of the maximum number of iterations and the tolerance
    against the accuracy.

    For each of three tolerances a LinearSVR is fitted for 50 values of
    `max_iter` between 100 and 5000; the value returned by
    `compute_accuracy` is recorded for the train and the test predictions.
    Every tolerance contributes one train and one test curve to the figure,
    which is saved as ./Figures/MaxIter_Tolerance_<name>.pdf when
    `generate_plots.directory_exists` reports the directory; otherwise a
    message is printed.
    """

    sns.set()
    sns.set_style("ticks")

    tolerances = [1e-3, 1e-4, 1e-5]
    # Legend text for each tolerance, in the same order as `tolerances`.
    tol_labels = ['1e-3', '1e-4', '1e-5']
    iteration_caps = np.linspace(100, 5000, 50, dtype=int)

    scaled_train, scaler = ml_funcs.apply_scaling(train_features, 'SVR', name,
                                                  save_scaler=False)
    scaled_test = scaler.transform(test_features)

    def score_pair(model):
        """Return (train accuracy, test accuracy) of an already fitted model."""
        on_train = compute_accuracy(model.predict(scaled_train), train_labels)
        on_test = compute_accuracy(model.predict(scaled_test), test_labels)
        return on_train, on_test

    # One list of scores per tolerance.
    curves_train, curves_test = [], []

    for tolerance in tolerances:
        curve_train, curve_test = [], []
        print("Tolerance:", tolerance)

        for cap in iteration_caps:
            print("Max. iterations:", cap)

            model = LinearSVR(tol=tolerance, max_iter=cap, random_state=0)
            model.fit(scaled_train, train_labels)

            on_train, on_test = score_pair(model)
            curve_train.append(on_train)
            curve_test.append(on_test)

        curves_train.append(curve_train)
        curves_test.append(curve_test)

    fig = plt.figure(figsize=(10, 6))

    for tol_label, curve_train, curve_test in zip(tol_labels, curves_train,
                                                  curves_test):
        sns.lineplot(x=iteration_caps, y=curve_train,
                     label='Train (tol' + tol_label + ')')
        sns.lineplot(x=iteration_caps, y=curve_test,
                     label='Test (tol' + tol_label + ')')

    plt.legend(frameon=False, loc='lower left', bbox_to_anchor=(1.0, 0.0))
    plt.xlabel('Maximum number of iterations')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/MaxIter_Tolerance_" + name + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
def feature_imp_boxplots(data, env):
    """
    Compute the feature importance based on a random forest.
      1) Impurity-based importance
      2) Permutation importance

    The data is split 80/20 into a train and a test set, scaled, and a
    random forest with the hyperparameters belonging to `env` is fitted.
    Three figures are written to ./Figures (when `directory_exists` reports
    the directory): a bar plot of the impurity-based importances and one
    permutation-importance box plot each for the train and the test set.
    The train and test scores of the forest are printed.

    Parameters:
      data -- input forwarded to `ml_funcs.get_features_and_labels`.
      env  -- 'CBD', 'suburbs' or 'combined'. Any other value prints a
              message and returns without doing anything.

    Information:
    https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html
    https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html
    """

    if env in ('CBD', 'suburbs'):
        net_type = "split"
        feature_names = np.array([
            'Area', 'Compactness', '#Neighbours', '#Adjacent Buildings',
            '#Vertices', 'Length', 'Width', 'Slimness', 'Complexity'
        ])
    elif env == 'combined':
        net_type = "single"
        feature_names = np.array([
            'Area', 'Compactness', '#Neighbours', '#Adjacent Buildings',
            '#Vertices', 'Length', 'Width', 'Slimness', 'Complexity',
            'Morphology'
        ])
    else:
        print("Boxplots feature importance: not a valid option.")
        return

    features, labels = ml_funcs.get_features_and_labels(data, net_type, False,
                                                        [], labels=True)

    train_X, test_X, train_y, test_y = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=0)

    train_X_scaled, scaler = ml_funcs.apply_scaling(train_X, 'RF', env)
    test_X_scaled = scaler.transform(test_X)

    # `env` was validated above, so it is either 'CBD' or one of
    # 'suburbs' / 'combined' here.
    if env == 'CBD':
        regressor = RandomForestRegressor(n_estimators=450,
                                          min_samples_split=50,
                                          min_samples_leaf=15,
                                          max_features='sqrt', max_depth=14,
                                          bootstrap=False, random_state=0,
                                          n_jobs=-1)
    else:
        regressor = RandomForestRegressor(n_estimators=100,
                                          min_samples_split=20,
                                          min_samples_leaf=5,
                                          max_features='sqrt', max_depth=None,
                                          bootstrap=True, random_state=0,
                                          n_jobs=-1)

    regressor.fit(train_X_scaled, train_y)

    # 1) Impurity-based importance, most important feature first.
    fig = plt.figure(figsize=(6, 4))
    sns.set_style("ticks")
    imp = regressor.feature_importances_
    sort_imp = imp.argsort()[::-1]
    # x / y must be passed by keyword: recent seaborn releases no longer
    # accept them positionally.
    barplot = sns.barplot(x=imp[sort_imp], y=feature_names[sort_imp],
                          color='steelblue')
    barplot.set_xlabel("Importance")
    fig.tight_layout()
    sns.despine()

    if directory_exists("./Figures"):
        plt.savefig("./Figures/Importances_" + env + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
    else:
        print("Directory: ./Figures does not exist!")

    print("RF train accuracy: %0.3f" % regressor.score(train_X_scaled, train_y))
    print("RF test accuracy: %0.3f" % regressor.score(test_X_scaled, test_y))

    # 2) Permutation importance on the train and on the test set.
    _plot_permutation_importance(regressor, train_X_scaled, train_y,
                                 feature_names, env, "Train")
    _plot_permutation_importance(regressor, test_X_scaled, test_y,
                                 feature_names, env, "Test")


def _plot_permutation_importance(regressor, features, labels, feature_names,
                                 env, subset):
    """
    Box-plot the permutation importance of a fitted regressor on one data
    subset and save it as ./Figures/Perm_Importance_<env>_<subset>.pdf.
    """

    result = permutation_importance(regressor, features, labels, n_repeats=25,
                                    random_state=0, n_jobs=-1)
    # Most important feature (highest mean importance) first.
    sorted_idx = result.importances_mean.argsort()[::-1]

    sns.set_style("ticks")
    fig, ax = plt.subplots(figsize=(6, 8))
    ax.boxplot(result.importances[sorted_idx].T)
    ax.set_ylabel("Permutation Importance")
    ax.set_xticklabels(labels=feature_names[sorted_idx], rotation=45,
                       horizontalalignment='right')
    fig.tight_layout()
    sns.despine()

    if directory_exists("./Figures"):
        plt.savefig("./Figures/Perm_Importance_" + env + "_" + subset + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
    else:
        print("Directory: ./Figures does not exist!")
def svr(train_features, train_labels, test_features, names, dummies, net_type,
        extra_features=False):
    """
    Train the Support Vector Regressor from training data with labels and
    perform predictions on the test data.

    Parameters:
      train_features -- training features (presumably a pandas DataFrame;
                        it is indexed with `names` / `dummies`).
      train_labels   -- training labels; only the first column of
                        `train_labels.to_numpy()` is used as target.
      test_features  -- features to predict on; its `.empty` attribute
                        decides whether predictions are made at all.
      names          -- columns that are scaled when `extra_features` is set.
      dummies        -- columns appended unscaled when `extra_features` is
                        set.
      net_type       -- label used in the log line and forwarded to
                        `ml_funcs.apply_scaling`.
      extra_features -- when True, scale only `names` and concatenate the
                        `dummies` columns as they are.

    Returns:
      The predictions for `test_features`, or None when `test_features` is
      empty.
    """

    print(
        '\n=== Running Support Vector Regression for {0} ==='.format(net_type))

    regressor = LinearSVR(random_state=0, tol=1e-5, max_iter=5000,
                          loss='squared_epsilon_insensitive', epsilon=0.0,
                          C=0.0001, dual=False)

    if extra_features:
        # Scale only the `names` columns; the `dummies` columns are appended
        # to the scaled block unchanged.
        train_scaled_tmp, scaler = ml_funcs.apply_scaling(
            train_features[names], 'SVR', net_type, save_scaler=False)
        train_scaled = np.concatenate(
            [train_scaled_tmp, np.array(train_features[dummies])], axis=1)
    else:
        train_scaled, scaler = ml_funcs.apply_scaling(
            train_features, 'SVR', net_type, save_scaler=False)

    # Fit model to the data.
    print('>> Training the network <<')
    starttime = time()
    regressor.fit(train_scaled, train_labels.to_numpy().T[0])
    duration = time() - starttime
    print("Time: ", round(duration, 2), "s")

    # Only perform predictions when there are test features; make the
    # "nothing to predict" outcome explicit instead of leaving the return
    # value undefined.
    if test_features.empty:
        return None

    # Scale the test features with the scaler fitted on the training data.
    if extra_features:
        test_scaled_tmp = scaler.transform(test_features[names])
        test_scaled = np.concatenate(
            [test_scaled_tmp, np.array(test_features[dummies])], axis=1)
    else:
        test_scaled = scaler.transform(test_features)

    print('>> Perform predictions <<')
    starttime = time()
    predictions = regressor.predict(test_scaled)
    duration = time() - starttime
    print("Time: ", round(duration, 2), "s")

    return predictions
def randomforest(train_features, train_labels, test_features, names, dummies,
                 net_type, extra_features=False):
    """
    Train the Random Forest Regressor from training data with labels and
    perform predictions on the test data.

    Returns `(predictions, importances)` when `test_features` is not empty,
    and only `importances` (the fitted forest's feature importances as a
    list) when it is empty.
    """

    print(
        '\n=== Running Random Forest Regression for {0} ==='.format(net_type))

    forest = RandomForestRegressor(n_estimators=250, max_features='sqrt',
                                   random_state=0, n_jobs=-1)

    # https://stackoverflow.com/questions/43798377/one-hot-encode-categorical-variables-and-scale-continuous-ones-simultaneouely
    # Only apply feature scaling to the numerical features and not to the
    # one-hot-encoded ones.
    if not extra_features:
        scaled_train, scaler = ml_funcs.apply_scaling(
            train_features, 'RFR', net_type, save_scaler=False)
    else:
        scaled_numeric, scaler = ml_funcs.apply_scaling(
            train_features[names], 'RFR', net_type, save_scaler=False)
        unscaled_dummies = np.array(train_features[dummies])
        scaled_train = np.concatenate([scaled_numeric, unscaled_dummies],
                                      axis=1)

    # Fit model to the data.
    print('>> Training the network <<')
    fit_started = time()
    forest.fit(scaled_train, train_labels.to_numpy().T[0])
    fit_finished = time()
    print("Time: ", round(fit_finished - fit_started, 2), "s")

    importances = list(forest.feature_importances_)

    # Without test features there is nothing to predict: hand back the
    # importances only.
    if test_features.empty:
        return importances

    # Scale the test features with the scaler fitted on the training data.
    if not extra_features:
        scaled_test = scaler.transform(test_features)
    else:
        scaled_numeric = scaler.transform(test_features[names])
        unscaled_dummies = np.array(test_features[dummies])
        scaled_test = np.concatenate([scaled_numeric, unscaled_dummies],
                                     axis=1)

    print('>> Perform predictions <<')
    predict_started = time()
    predictions = forest.predict(scaled_test)
    predict_finished = time()
    print("Time: ", round(predict_finished - predict_started, 2), "s")

    return predictions, importances