def _unpack_split_result(result, is_rfr):
    """
    Normalise a regressor result to a (predictions, importances) pair.

    For the "RFR" method the callers in this file unpack the result into
    predictions and feature importances; for every other method the result
    is used as the predictions directly and there are no importances.
    """
    if is_rfr:
        predictions, importances = result
        return predictions, importances
    return result, None


def split_network(train_suburbs, train_cbds, test_db, tables_test, model_suburbs,
                  scaler_suburbs, model_cbds, scaler_cbds, method, save_predictions,
                  labels, save_model, test_subsets, feature_subset):
    """
    Perform the machine learning based on a split training network that
    separates the CBDs and Suburban and Rural areas.

    Parameters
    ----------
    train_suburbs, train_cbds : sized collections of training data
        Either may be empty. When both are empty the supplied models are
        used for prediction instead of training new ones.
    test_db : database identifier passed to db_funcs.setup_connection
        When falsy, no predictions are made and the function only trains
        on the available training data (useful for storing a model).
    tables_test : iterable of table names, or falsy
        When falsy, all tables returned by db_funcs.unique_tables are used.
        A table named 'cbds' is always skipped.
    model_suburbs, scaler_suburbs, model_cbds, scaler_cbds
        Forwarded to predict_from_model when no training data is given.
    method : str
        "RFR" results are unpacked into (predictions, importances); any
        other method is treated as returning predictions only.
    save_predictions : bool
        Store the predictions through db_funcs.store_predictions.
    labels : bool
        Whether the test data carries labels; if so statistics are printed
        and cumulative error plots are generated.
    save_model, test_subsets, feature_subset
        Forwarded unchanged to the helper functions.

    Returns
    -------
    None
    """
    reading_time = []
    predict_time = []
    is_rfr = method == "RFR"
    have_train_suburbs = len(train_suburbs) != 0
    have_train_cbds = len(train_cbds) != 0

    if have_train_suburbs:
        train_feat_suburb, train_label_suburb = ml_funcs.get_features_and_labels(
            train_suburbs, "split", test_subsets, feature_subset, labels=True)

    if have_train_cbds:
        train_feat_cbd, train_label_cbd = ml_funcs.get_features_and_labels(
            train_cbds, "split", test_subsets, feature_subset, labels=True)

    # No test database is specified, only train the model based on the
    # training data. Useful when training and storing a model to a file.
    if not test_db:
        if have_train_suburbs:
            train_from_data(method, train_feat_suburb, train_label_suburb,
                            np.array([]), save_model, 'suburbs')
        if have_train_cbds:
            train_from_data(method, train_feat_cbd, train_label_cbd,
                            np.array([]), save_model, 'CBD')
        return

    # A database to perform the tests/ predictions on is specified.
    connection = db_funcs.setup_connection(test_db)
    connection.autocommit = True
    cursor = connection.cursor()

    # Without training data the pre-trained models are used for prediction.
    use_model = not have_train_suburbs and not have_train_cbds

    # The try/finally guarantees the connection and cursor are released even
    # when reading, predicting or storing fails half-way through the tables.
    try:
        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            starttime = time()
            test_suburbs, test_cbds, _ = db_funcs.read_data(connection, table,
                                                            training=labels)
            reading_time.append(time() - starttime)

            if labels:
                test_feat_suburbs, test_labels_suburbs = \
                    ml_funcs.get_features_and_labels(test_suburbs, "split", test_subsets,
                                                     feature_subset, labels=labels)
                test_feat_cbds, test_labels_cbds = \
                    ml_funcs.get_features_and_labels(test_cbds, "split", test_subsets,
                                                     feature_subset, labels=labels)
            else:
                test_feat_suburbs = \
                    ml_funcs.get_features_and_labels(test_suburbs, "split", test_subsets,
                                                     feature_subset, labels=labels)
                test_feat_cbds = \
                    ml_funcs.get_features_and_labels(test_cbds, "split", test_subsets,
                                                     feature_subset, labels=labels)

            pred_cbds, pred_suburbs = np.array([]), np.array([])
            imp_cbds, imp_suburbs = None, None

            starttime = time()

            if use_model:
                # There is no training data specified, use model.
                # There must be test features for the CBD present.
                if len(test_feat_cbds) != 0:
                    pred_cbds, imp_cbds = _unpack_split_result(
                        predict_from_model(method, test_feat_cbds, model_cbds,
                                           scaler_cbds, 'CBD'),
                        is_rfr)
                else:
                    print("Warning: no CBD data present in test set {0}".format(table))

                # There must be test features for the suburbs/rural areas present.
                if len(test_feat_suburbs) != 0:
                    pred_suburbs, imp_suburbs = _unpack_split_result(
                        predict_from_model(method, test_feat_suburbs, model_suburbs,
                                           scaler_suburbs, 'suburbs'),
                        is_rfr)
                else:
                    print("Warning: no rural/suburban data present in test set {0}"
                          .format(table))
            else:
                # There is training data specified, check which area
                # morphologies are present in both training and test data.
                if have_train_suburbs and len(test_feat_suburbs) != 0:
                    pred_suburbs, imp_suburbs = _unpack_split_result(
                        train_from_data(method, train_feat_suburb, train_label_suburb,
                                        test_feat_suburbs, save_model, 'suburbs'),
                        is_rfr)
                else:
                    print("Warning: training and testing data do not both contain "
                          "suburban/rural data!")

                if have_train_cbds and len(test_feat_cbds) != 0:
                    pred_cbds, imp_cbds = _unpack_split_result(
                        train_from_data(method, train_feat_cbd, train_label_cbd,
                                        test_feat_cbds, save_model, 'CBD'),
                        is_rfr)
                else:
                    print("Warning: training and testing data do not both contain CBD data!")

            predict_time.append(time() - starttime)

            # Labels are present: print statistics for the height predictions.
            if labels:
                if len(pred_suburbs) != 0:
                    # Importances are only passed along for the RFR method.
                    extra = (imp_suburbs,) if is_rfr else ()
                    ml_funcs.get_statistics(test_labels_suburbs, pred_suburbs, "split",
                                            feature_subset, *extra)
                    generate_plots.plot_cumulative_errors(test_labels_suburbs,
                                                          pred_suburbs, 'suburbs')

                if len(pred_cbds) != 0:
                    extra = (imp_cbds,) if is_rfr else ()
                    ml_funcs.get_statistics(test_labels_cbds, pred_cbds, "split",
                                            feature_subset, *extra)
                    # NOTE(review): the plot label differs per method ('CBDs' for
                    # RFR, 'CBD' otherwise); kept as-is because it is not clear
                    # from here whether plot_cumulative_errors depends on it.
                    cbd_plot_label = 'CBDs' if is_rfr else 'CBD'
                    generate_plots.plot_cumulative_errors(test_labels_cbds, pred_cbds,
                                                          cbd_plot_label)

            # Store predictions in database.
            if save_predictions:
                if len(pred_suburbs) != 0:
                    height_values = list(zip(test_suburbs.id, pred_suburbs))
                    db_funcs.store_predictions(cursor, height_values, table, method, 'split')

                if len(pred_cbds) != 0:
                    height_values = list(zip(test_cbds.id, pred_cbds))
                    db_funcs.store_predictions(cursor, height_values, table, method, 'split')
    finally:
        db_funcs.close_connection(connection, cursor)

    total_reading = sum(reading_time)
    total_predict = sum(predict_time)

    print("\n>> Total duration (s) of reading data "
          "into dataframes: {0} ({1})".format(total_reading,
                                              timedelta(seconds=total_reading)))
    print("\n>> Total duration (s) of the building "
          " height predictions: {0} ({1})".format(total_predict,
                                                  timedelta(seconds=total_predict)))
def single_network(train_data, test_db, tables_test, model, scaler, method,
                   save_predictions, labels, save_model, test_subsets, feature_subset):
    """
    Perform the machine learning based on a single training network that
    combines the CBDs and Suburban and Rural areas.

    Parameters
    ----------
    train_data : sized collection of training data
        May be empty, in which case the supplied model and scaler are used
        for prediction instead of training a new model.
    test_db : database identifier passed to db_funcs.setup_connection
        When falsy, no predictions are made and the function only trains
        on the training data (if any).
    tables_test : iterable of table names, or falsy
        When falsy, all tables returned by db_funcs.unique_tables are used.
        A table named 'cbds' is always skipped.
    model, scaler
        Forwarded to predict_from_model when no training data is given.
    method : str
        "RFR" results are unpacked into (predictions, importances); any
        other method is treated as returning predictions only.
    save_predictions : bool
        Store the predictions through db_funcs.store_predictions.
    labels : bool
        Whether the test data carries labels; if so statistics are printed
        and a cumulative error plot is generated.
    save_model, test_subsets, feature_subset
        Forwarded unchanged to the helper functions.

    Returns
    -------
    None
    """
    reading_time = []
    predict_time = []
    is_rfr = method == "RFR"
    have_train_data = len(train_data) != 0

    if have_train_data:
        train_features, train_labels = ml_funcs.get_features_and_labels(
            train_data, "single", test_subsets, feature_subset, labels=True)

    # No test database is specified, only train the model based on the
    # training data. Useful when training and storing a model to a file.
    if not test_db:
        # train_features only exists when training data was supplied, so the
        # training data is checked first to avoid a NameError on empty input.
        if have_train_data and len(train_features) != 0:
            train_from_data(method, train_features, train_labels, np.array([]),
                            save_model, 'combined')
        return

    # A database to perform the tests/ predictions on is specified.
    connection = db_funcs.setup_connection(test_db)
    connection.autocommit = True
    cursor = connection.cursor()

    # The try/finally guarantees the connection and cursor are released even
    # when reading, predicting or storing fails half-way through the tables.
    try:
        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            starttime = time()
            _, _, test_data = db_funcs.read_data(connection, table, training=labels)
            reading_time.append(time() - starttime)

            if labels:
                test_features, test_labels = ml_funcs.get_features_and_labels(
                    test_data, "single", test_subsets, feature_subset, labels=labels)
            else:
                test_features = ml_funcs.get_features_and_labels(
                    test_data, "single", test_subsets, feature_subset, labels=labels)

            starttime = time()

            if have_train_data:
                result = train_from_data(method, train_features, train_labels,
                                         test_features, save_model, 'combined')
            else:
                result = predict_from_model(method, test_features, model, scaler,
                                            'combined')

            # Only the RFR method yields feature importances next to the predictions.
            if is_rfr:
                predictions, importances = result
            else:
                predictions = result

            predict_time.append(time() - starttime)

            # Labels are present: print statistics for the height predictions.
            if labels:
                if is_rfr:
                    ml_funcs.get_statistics(test_labels, predictions, "single",
                                            feature_subset, importances)
                else:
                    ml_funcs.get_statistics(test_labels, predictions, "single",
                                            feature_subset)

                generate_plots.plot_cumulative_errors(test_labels, predictions, 'combined')

            # Store predictions in database.
            if save_predictions:
                height_values = list(zip(test_data.id, predictions))
                db_funcs.store_predictions(cursor, height_values, table, method, 'combined')
    finally:
        db_funcs.close_connection(connection, cursor)

    total_reading = sum(reading_time)
    total_predict = sum(predict_time)

    print("\n>> Total duration (s) of reading data "
          "into dataframes: {0} ({1})".format(total_reading,
                                              timedelta(seconds=total_reading)))
    print("\n>> Total duration (s) of the building "
          " height predictions: {0} ({1})".format(total_predict,
                                                  timedelta(seconds=total_predict)))
def test_geom_features_single(data, cursor, table, store_results, method):
    """
    Train and evaluate a single (combined) network on geometric features only.

    The data is split with train_test_split (test_size=0.75, random_state=42),
    the regressor selected by `method` ("RFR", "MLR" or "SVR") is run and its
    statistics are printed. For any other method a message is printed and the
    function returns without storing anything.

    When `store_results` is truthy, the predictions and the relative and
    percentage errors of the test rows are written to the database through
    db_funcs, under the name "<method>_geometric_single".
    """
    target = ["rel_height"]
    geometry = [
        "area", "compactness", "num_neighbours", "num_adjacent_blds",
        "num_vertices", "length", "width", "slimness", "complexity", "cbd",
    ]
    no_dummies = []

    X_train, X_test, y_train, y_test = train_test_split(
        data[geometry], data[target], test_size=0.75, random_state=42)

    # Flatten the single-column label frame into a 1-D array.
    y_test = y_test.to_numpy().T[0]

    if method == "RFR":
        # Only the random forest also yields feature importances.
        predictions, importances = randomforest(X_train, y_train, X_test, geometry,
                                                no_dummies, "combined",
                                                extra_features=False)
    elif method in ("MLR", "SVR"):
        regressor = mlr if method == "MLR" else svr
        predictions = regressor(X_train, y_train, X_test, geometry, no_dummies,
                                "combined", extra_features=False)
        importances = None
    else:
        print("Not a valid method.")
        return

    print_statistics(geometry, importances, predictions, y_test, "combined", method)

    if not store_results:
        return

    name = method + "_geometric_single"
    test_ids = data.loc[X_test.index].id

    db_funcs.store_predictions(cursor, list(zip(test_ids, predictions)), table, name,
                               'combined')

    # Negative: underestimation, positive: overestimation.
    # Both the relative error and the percentage error go into the database.
    rel_errors = predictions - y_test
    perc_error = (rel_errors / y_test) * 100

    db_funcs.store_errors(cursor, list(zip(test_ids, rel_errors, perc_error)), table,
                          name, 'combined')
def test_all_features_split(data_suburb, data_cbd, cursor, table, store_results, method):
    """
    Train and evaluate a split network (suburbs/rural areas and CBDs) on both
    geometric and non-geometric features.

    The categorical column 'bldg_type' is one-hot encoded separately for each
    area, so the suburb and CBD feature lists may contain different dummy
    columns. Each area is split with train_test_split (test_size=0.75,
    random_state=42) and the regressor selected by `method` ("RFR", "MLR" or
    "SVR") is run, suburbs first and CBD second. For any other method a
    message is printed and the function returns without storing anything.

    When `store_results` is truthy, predictions and relative/percentage
    errors of the test rows are stored through db_funcs under the name
    "<method>_all_split".

    Source: https://blog.cambridgespark.com/robust-one-hot-encoding-in-python-3e29bfcec77e
    """
    cat_columns = ['bldg_type']

    def dummy_columns(frame):
        # Names of the one-hot columns that get_dummies derived from cat_columns.
        found = []
        for col in frame:
            if "__" in col and col.split("__")[0] in cat_columns:
                found.append(col)
        return found

    # Create the dummy columns (one hot encoding) for the categorical data.
    suburb_processed = pd.get_dummies(data_suburb, prefix_sep="__", columns=cat_columns)
    cbd_processed = pd.get_dummies(data_cbd, prefix_sep="__", columns=cat_columns)

    cat_dummies_suburb = dummy_columns(suburb_processed)
    cat_dummies_cbd = dummy_columns(cbd_processed)

    # Shared numerical features; the dummy columns are appended per area.
    features_general = [
        "area", "compactness", "num_neighbours", "num_adjacent_blds",
        "num_vertices", "length", "width", "slimness", "complexity",
        "avg_hh_income", "avg_hh_size", "pop_density", "h_mean", "num_amenities",
    ]
    features_suburb = features_general + cat_dummies_suburb
    features_cbd = features_general + cat_dummies_cbd
    target = ["rel_height"]

    # Split the data into a training and testing set.
    X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(
        suburb_processed[features_suburb], suburb_processed[target],
        test_size=0.75, random_state=42)
    X_cbd_train, X_cbd_test, y_cbd_train, y_cbd_test = train_test_split(
        cbd_processed[features_cbd], cbd_processed[target],
        test_size=0.75, random_state=42)

    # Flatten the single-column label frames into 1-D arrays.
    y_sub_test = y_sub_test.to_numpy().T[0]
    y_cbd_test = y_cbd_test.to_numpy().T[0]

    if method == "RFR":
        # Random forest: suburbs first, then CBD; both also yield importances.
        pred_suburbs, imp_suburbs = randomforest(X_sub_train, y_sub_train, X_sub_test,
                                                 features_general, cat_dummies_suburb,
                                                 "suburbs", extra_features=True)
        print_statistics(features_suburb, imp_suburbs, pred_suburbs, y_sub_test,
                         "suburbs", method)

        pred_cbd, imp_cbd = randomforest(X_cbd_train, y_cbd_train, X_cbd_test,
                                         features_general, cat_dummies_cbd,
                                         "CBD", extra_features=True)
        print_statistics(features_cbd, imp_cbd, pred_cbd, y_cbd_test, "CBD", method)
    elif method in ("MLR", "SVR"):
        # Linear and support vector regressors share one calling convention.
        regressor = mlr if method == "MLR" else svr

        pred_suburbs = regressor(X_sub_train, y_sub_train, X_sub_test,
                                 features_general, cat_dummies_suburb,
                                 "suburbs", extra_features=True)
        print_statistics(features_suburb, None, pred_suburbs, y_sub_test,
                         "suburbs", method)

        pred_cbd = regressor(X_cbd_train, y_cbd_train, X_cbd_test,
                             features_general, cat_dummies_cbd,
                             "CBD", extra_features=True)
        print_statistics(features_cbd, None, pred_cbd, y_cbd_test, "CBD", method)
    else:
        print("Not a valid method.")
        return

    if not store_results:
        return

    name = method + "_all_split"
    suburb_ids = data_suburb.loc[X_sub_test.index].id
    cbd_ids = data_cbd.loc[X_cbd_test.index].id

    db_funcs.store_predictions(cursor, list(zip(suburb_ids, pred_suburbs)), table,
                               name, 'suburbs')
    db_funcs.store_predictions(cursor, list(zip(cbd_ids, pred_cbd)), table,
                               name, 'CBDs')

    # Negative: underestimation, positive: overestimation.
    # Both the relative error and the percentage error go into the database.
    rel_errors_suburbs = pred_suburbs - y_sub_test
    perc_error_suburbs = (rel_errors_suburbs / y_sub_test) * 100
    db_funcs.store_errors(cursor,
                          list(zip(suburb_ids, rel_errors_suburbs, perc_error_suburbs)),
                          table, name, 'suburbs')

    rel_errors_cbd = pred_cbd - y_cbd_test
    perc_error_cbd = (rel_errors_cbd / y_cbd_test) * 100
    db_funcs.store_errors(cursor,
                          list(zip(cbd_ids, rel_errors_cbd, perc_error_cbd)),
                          table, name, 'CBD')
def test_all_features_single(data, cursor, table, store_results, method):
    """
    Include both geometric and non-geometric features during the training
    and prediction process. Based on a single training network.

    The categorical column 'bldg_type' is one-hot encoded, the data is split
    with train_test_split (test_size=0.75, random_state=42) and the regressor
    selected by `method` ("RFR", "MLR" or "SVR") is run and its statistics
    printed. For any other method a message is printed and the function
    returns without storing anything.

    When `store_results` is truthy, the predictions and the relative and
    percentage errors of the test rows are stored through db_funcs under the
    name "<method>_all_single".
    """
    # Create the dummy columns (one hot encoding) for the categorical data.
    cat_columns = ['bldg_type']
    data_processed = pd.get_dummies(data, prefix_sep="__", columns=cat_columns)

    # Extract the names from the dummy columns for later use.
    cat_dummies = [col for col in data_processed
                   if "__" in col and col.split("__")[0] in cat_columns]

    # Create list of features so we can extract the data from the dataframe.
    features_general = [
        "area", "compactness", "num_neighbours", "num_adjacent_blds",
        "num_vertices", "length", "width", "slimness", "complexity", "cbd",
        "avg_hh_income", "avg_hh_size", "pop_density", "h_mean", "num_amenities",
    ]
    features_all = features_general + cat_dummies
    labels = ["rel_height"]

    X_train, X_test, y_train, y_test = train_test_split(
        data_processed[features_all], data_processed[labels],
        test_size=0.75, random_state=42)

    # Flatten the single-column label frame into a 1-D array.
    y_test = y_test.to_numpy().T[0]

    if method == "RFR":
        # Only the random forest also yields feature importances.
        predictions, importances = randomforest(X_train, y_train, X_test, features_all,
                                                cat_dummies, "combined",
                                                extra_features=False)
    elif method in ("MLR", "SVR"):
        regressor = mlr if method == "MLR" else svr
        predictions = regressor(X_train, y_train, X_test, features_all, cat_dummies,
                                "combined", extra_features=False)
        importances = None
    else:
        print("Not a valid method.")
        return

    print_statistics(features_all, importances, predictions, y_test, "combined", method)

    if not store_results:
        return

    # The results get their own "_all_single" name. The "_geometric_single"
    # name belongs to the geometric-only experiment; reusing it here would
    # store both experiments under the same identifier.
    name = method + "_all_single"
    test_ids = data.loc[X_test.index].id

    height_vals = list(zip(test_ids, predictions))
    db_funcs.store_predictions(cursor, height_vals, table, name, 'combined')

    # Negative: underestimation, positive: overestimation.
    # Store relative error and the percentage error in the database.
    rel_errors = predictions - y_test
    perc_error = (rel_errors / y_test) * 100

    error_val = list(zip(test_ids, rel_errors, perc_error))
    db_funcs.store_errors(cursor, error_val, table, name, 'combined')
def test_geom_features_split(data_suburb, data_cbd, cursor, table, store_results, method):
    """
    Train and evaluate a split network (suburbs/rural areas and CBDs) on
    geometric features only.

    Each area is split with train_test_split (test_size=0.75,
    random_state=42) and the regressor selected by `method` ("RFR", "MLR" or
    "SVR") is run, suburbs first and CBD second, printing the statistics of
    each run. For any other method a message is printed and the function
    returns without storing anything.

    When `store_results` is truthy, predictions and relative/percentage
    errors of the test rows are stored through db_funcs under the name
    "<method>_geometric_split".
    """
    target = ["rel_height"]
    geometry = [
        "area", "compactness", "num_neighbours", "num_adjacent_blds",
        "num_vertices", "length", "width", "slimness", "complexity",
    ]
    no_dummies = []

    # Split the data into a training and testing set.
    X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(
        data_suburb[geometry], data_suburb[target], test_size=0.75, random_state=42)
    X_cbd_train, X_cbd_test, y_cbd_train, y_cbd_test = train_test_split(
        data_cbd[geometry], data_cbd[target], test_size=0.75, random_state=42)

    # Flatten the single-column label frames into 1-D arrays.
    y_sub_test = y_sub_test.to_numpy().T[0]
    y_cbd_test = y_cbd_test.to_numpy().T[0]

    if method == "RFR":
        # Random forest: suburbs first, then CBD; both also yield importances.
        pred_suburbs, imp_suburbs = randomforest(X_sub_train, y_sub_train, X_sub_test,
                                                 geometry, no_dummies, "suburbs",
                                                 extra_features=False)
        print_statistics(geometry, imp_suburbs, pred_suburbs, y_sub_test,
                         "suburbs", method)

        pred_cbd, imp_cbd = randomforest(X_cbd_train, y_cbd_train, X_cbd_test,
                                         geometry, no_dummies, "CBD",
                                         extra_features=False)
        print_statistics(geometry, imp_cbd, pred_cbd, y_cbd_test, "CBD", method)
    elif method in ("MLR", "SVR"):
        # Linear and support vector regressors share one calling convention.
        regressor = mlr if method == "MLR" else svr

        pred_suburbs = regressor(X_sub_train, y_sub_train, X_sub_test, geometry,
                                 no_dummies, "suburbs", extra_features=False)
        print_statistics(geometry, None, pred_suburbs, y_sub_test, "suburbs", method)

        pred_cbd = regressor(X_cbd_train, y_cbd_train, X_cbd_test, geometry,
                             no_dummies, "CBD", extra_features=False)
        print_statistics(geometry, None, pred_cbd, y_cbd_test, "CBD", method)
    else:
        print("Not a valid method.")
        return

    if not store_results:
        return

    name = method + "_geometric_split"
    suburb_ids = data_suburb.loc[X_sub_test.index].id
    cbd_ids = data_cbd.loc[X_cbd_test.index].id

    db_funcs.store_predictions(cursor, list(zip(suburb_ids, pred_suburbs)), table,
                               name, 'suburbs')
    db_funcs.store_predictions(cursor, list(zip(cbd_ids, pred_cbd)), table,
                               name, 'CBDs')

    # Negative: underestimation, positive: overestimation.
    # Both the relative error and the percentage error go into the database.
    rel_errors_suburbs = pred_suburbs - y_sub_test
    perc_error_suburbs = (rel_errors_suburbs / y_sub_test) * 100
    db_funcs.store_errors(cursor,
                          list(zip(suburb_ids, rel_errors_suburbs, perc_error_suburbs)),
                          table, name, 'suburbs')

    rel_errors_cbd = pred_cbd - y_cbd_test
    perc_error_cbd = (rel_errors_cbd / y_cbd_test) * 100
    db_funcs.store_errors(cursor,
                          list(zip(cbd_ids, rel_errors_cbd, perc_error_cbd)),
                          table, name, 'CBD')