def collect_data(database, tables, train=False):
    """
    Based on the provided database and tables,
    retrieve data from the database for different settings:
    CBD, Suburban/Rural and a combination of these two.
    """

    connection = db_funcs.setup_connection(database)
    cursor = connection.cursor()

    # If no specific tables were given, use all tables in the
    # database for training.
    if not tables:
        tables = db_funcs.unique_tables(cursor)

    data_suburb, data_cbd, data_full = np.array([]), np.array([]), np.array([])

    # Read the data for each table and collect it into three pandas DataFrames.
    for i, table in enumerate(tables):
        if i == 0:
            data_suburb, data_cbd, data_full = db_funcs.read_data(connection, table,
                                                                  training=train)
        else:
            suburb, cbd, full = db_funcs.read_data(connection, table, training=train)
            data_suburb = data_suburb.append(suburb)
            data_cbd = data_cbd.append(cbd)
            data_full = data_full.append(full)

    db_funcs.close_connection(connection, cursor)

    return data_full, data_suburb, data_cbd
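
# A minimal sketch (not part of the original code) of the same accumulation for
# pandas >= 2.0, where DataFrame.append has been removed: collect the per-table
# frames in plain lists and concatenate them once. The name collect_data_concat
# is hypothetical; db_funcs.read_data is assumed to return three DataFrames per
# table, as above.
import pandas as pd

import db_funcs


def collect_data_concat(database, tables, train=False):
    """
    Hypothetical pd.concat-based variant of collect_data above.
    """
    connection = db_funcs.setup_connection(database)
    cursor = connection.cursor()

    # If no specific tables were given, use all tables in the database.
    if not tables:
        tables = db_funcs.unique_tables(cursor)

    suburbs, cbds, fulls = [], [], []
    for table in tables:
        suburb, cbd, full = db_funcs.read_data(connection, table, training=train)
        suburbs.append(suburb)
        cbds.append(cbd)
        fulls.append(full)

    db_funcs.close_connection(connection, cursor)

    # A single concatenation at the end avoids the repeated copying caused by
    # per-iteration DataFrame.append calls.
    return (pd.concat(fulls, ignore_index=True),
            pd.concat(suburbs, ignore_index=True),
            pd.concat(cbds, ignore_index=True))
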
def get_data(database, tables):
    """
    Retrieve the data for the suburbs, CBDs and the combined dataset.
    """

    connection = db_funcs.setup_connection(database)
    cursor = connection.cursor()

    data_suburb, data_cbd, data_full = np.array([]), np.array([]), np.array([])

    for i, table in enumerate(tables):
        if i == 0:
            data_suburb, data_cbd, data_full = db_funcs.read_data(connection, table, training=True)
        else:
            suburb, cbd, full = db_funcs.read_data(connection, table, training=True)
            data_suburb = data_suburb.append(suburb)
            data_cbd = data_cbd.append(cbd)
            data_full = data_full.append(full)

    db_funcs.close_connection(connection, cursor)

    return data_full, data_suburb, data_cbd
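
# Illustrative usage only; "denver" and "denver_cutout" are the names used in main() below.
# Since get_data always reads with training=True, it behaves like collect_data above
# called with train=True.
data_full, data_suburb, data_cbd = get_data("denver", ["denver_cutout"])

# Assumed to be equivalent via collect_data, which additionally exposes the train flag:
data_full, data_suburb, data_cbd = collect_data("denver", ["denver_cutout"], train=True)
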
def main():
    """
    Perform all function calls.
    """

    table = "denver_cutout"
    store_results = False
    method = "RFR"

    connection = db_funcs.setup_connection("denver")
    connection.autocommit = True
    cursor = connection.cursor()

    data_suburb, data_cbd, data_full = db_funcs.read_data(connection,
                                                          table,
                                                          extra_features=True,
                                                          training=True)

    correlation_matrix(data_suburb, 'suburbs')
    correlation_matrix(data_cbd, 'CBD')
    correlation_matrix(data_full, 'combined')

    violin_plot(data_suburb, 'suburbs')
    violin_plot(data_cbd, 'CBD')
    violin_plot(data_full, 'combined')

    print("\n>>> Running with only geometric features <<<")
    #test_geom_features_split(data_suburb, data_cbd, cursor, table, store_results, method)
    test_geom_features_single(data_full, cursor, table, store_results, method)

    print(80 * '-')

    print("\n>>> Running with geometric and non-geometric features <<<")
    #test_all_features_split(data_suburb, data_cbd, cursor, table, store_results, method)
    test_all_features_single(data_full, cursor, table, store_results, method)

    db_funcs.close_connection(connection, cursor)
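
# Standard entry-point guard; assumes main() above is meant to be run as a script.
if __name__ == "__main__":
    main()
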
def split_network(train_suburbs, train_cbds, test_db, tables_test, model_suburbs, scaler_suburbs,
                  model_cbds, scaler_cbds, method, save_predictions, labels, save_model,
                  test_subsets, feature_subset):
    """
    Perform the machine learning based on a split training network
    that separates the CBDs and Suburban and Rural areas.
    """

    reading_time = []
    predict_time = []

    if len(train_suburbs) != 0:
        train_feat_suburb, train_label_suburb = ml_funcs.get_features_and_labels(train_suburbs,
                                                                                 "split",
                                                                                 test_subsets,
                                                                                 feature_subset,
                                                                                 labels=True)

    if len(train_cbds) != 0:
        train_feat_cbd, train_label_cbd = ml_funcs.get_features_and_labels(train_cbds, "split",
                                                                           test_subsets,
                                                                           feature_subset,
                                                                           labels=True)

    # A test database to perform the predictions on has been specified.
    if test_db:
        connection = db_funcs.setup_connection(test_db)
        connection.autocommit = True
        cursor = connection.cursor()

        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            starttime = time()

            test_suburbs, test_cbds, _ = db_funcs.read_data(connection, table, training=labels)

            endtime = time()
            duration = endtime - starttime
            reading_time.append(duration)

            if labels:
                test_feat_suburbs, test_labels_suburbs = \
                  ml_funcs.get_features_and_labels(test_suburbs, "split", test_subsets,
                                                   feature_subset, labels=labels)
                test_feat_cbds, test_labels_cbds = \
                  ml_funcs.get_features_and_labels(test_cbds, "split", test_subsets, feature_subset,
                                                   labels=labels)
            else:
                test_feat_suburbs = \
                  ml_funcs.get_features_and_labels(test_suburbs, "split", test_subsets,
                                                   feature_subset, labels=labels)
                test_feat_cbds = ml_funcs.get_features_and_labels(test_cbds, "split", test_subsets,
                                                                  feature_subset, labels=labels)

            pred_cbds, pred_suburbs = np.array([]), np.array([])

            starttime = time()

            # No training data was specified, so predict from the pre-trained model.
            if len(train_suburbs) == 0 and len(train_cbds) == 0:
                if method == "RFR":

                    # Predict only if CBD test features are present.
                    if len(test_feat_cbds) != 0:
                        pred_cbds, imp_cbds = predict_from_model(method, test_feat_cbds,
                                                                 model_cbds, scaler_cbds,
                                                                 'CBD')
                    else:
                        print("Warning: no CBD data present in test set {0}".format(table))

                    # Predict only if suburban/rural test features are present.
                    if len(test_feat_suburbs) != 0:
                        pred_suburbs, imp_suburbs = predict_from_model(method, test_feat_suburbs,
                                                                       model_suburbs,
                                                                       scaler_suburbs, 'suburbs')
                    else:
                        print("Warning: no rural/suburban data present in test set {0}"\
                              .format(table))

                else:
                    # Predict only if CBD test features are present.
                    if len(test_feat_cbds) != 0:
                        pred_cbds = predict_from_model(method, test_feat_cbds, model_cbds,
                                                       scaler_cbds, 'CBD')
                    else:
                        print("Warning: no CBD data present in test set {0}".format(table))

                    # Predict only if suburban/rural test features are present.
                    if len(test_feat_suburbs) != 0:
                        pred_suburbs = predict_from_model(method, test_feat_suburbs,
                                                          model_suburbs, scaler_suburbs,
                                                          'suburbs')
                    else:
                        print("Warning: no rural/suburban data present in test set {0}"\
                              .format(table))

            # Training data was specified: check which area morphologies are present.
            else:
                if method == "RFR":
                    if len(train_suburbs) != 0 and len(test_feat_suburbs) != 0:
                        pred_suburbs, imp_suburbs = train_from_data(method, train_feat_suburb,
                                                                    train_label_suburb,
                                                                    test_feat_suburbs,
                                                                    save_model, 'suburbs')
                    else:
                        print("Warning: training and testing data do not both contain " +\
                              "suburban/rural data!")

                    if len(train_cbds) != 0 and len(test_feat_cbds) != 0:
                        pred_cbds, imp_cbds = train_from_data(method, train_feat_cbd,
                                                              train_label_cbd, test_feat_cbds,
                                                              save_model, 'CBD')
                    else:
                        print("Warning: training and testing data do not both contain CBD data!")

                else:
                    if len(train_suburbs) != 0 and len(test_feat_suburbs) != 0:
                        pred_suburbs = train_from_data(method, train_feat_suburb,
                                                       train_label_suburb,
                                                       test_feat_suburbs, save_model, 'suburbs')
                    else:
                        print("Warning: training and testing data do not both contain " +\
                              "suburban/rural data!")

                    if len(train_cbds) != 0 and len(test_feat_cbds) != 0:
                        pred_cbds = train_from_data(method, train_feat_cbd, train_label_cbd,
                                                    test_feat_cbds, save_model, 'CBD')
                    else:
                        print("Warning: training and testing data do not both contain CBD data!")

            endtime = time()
            duration = endtime - starttime
            predict_time.append(duration)

            # Labels are present: print statistics for the height predictions.
            if labels:
                if method == "RFR":
                    if len(pred_suburbs) != 0:
                        ml_funcs.get_statistics(test_labels_suburbs, pred_suburbs, "split",
                                                feature_subset, imp_suburbs)
                        generate_plots.plot_cumulative_errors(test_labels_suburbs, pred_suburbs,
                                                              'suburbs')
                    if len(pred_cbds) != 0:
                        ml_funcs.get_statistics(test_labels_cbds, pred_cbds, "split",
                                                feature_subset, imp_cbds)
                        generate_plots.plot_cumulative_errors(test_labels_cbds, pred_cbds, 'CBD')
                else:
                    if len(pred_suburbs) != 0:
                        ml_funcs.get_statistics(test_labels_suburbs, pred_suburbs, "split",
                                                feature_subset)
                        generate_plots.plot_cumulative_errors(test_labels_suburbs, pred_suburbs,
                                                              'suburbs')
                    if len(pred_cbds) != 0:
                        ml_funcs.get_statistics(test_labels_cbds, pred_cbds, "split",
                                                feature_subset)
                        generate_plots.plot_cumulative_errors(test_labels_cbds, pred_cbds, 'CBD')

            # Store predictions in database.
            if save_predictions:
                if len(pred_suburbs) != 0:
                    height_values = list(zip(test_suburbs.id, pred_suburbs))
                    db_funcs.store_predictions(cursor, height_values, table, method, 'split')

                if len(pred_cbds) != 0:
                    height_values = list(zip(test_cbds.id, pred_cbds))
                    db_funcs.store_predictions(cursor, height_values, table, method, 'split')

        db_funcs.close_connection(connection, cursor)

        print("\n>> Total duration (s) of reading data " + \
              "into dataframes: {0} ({1})".format(sum(reading_time),
                                                  timedelta(seconds=sum(reading_time))))
        print("\n>> Total duration (s) of the building " + \
              "height predictions: {0} ({1})".format(sum(predict_time),
                                                     timedelta(seconds=sum(predict_time))))

    # No test database was specified: only train the model on the training data.
    # Useful when training a model and storing it to a file.
    else:
        if len(train_suburbs) != 0:
            train_from_data(method, train_feat_suburb, train_label_suburb, np.array([]),
                            save_model, 'suburbs')
        if len(train_cbds) != 0:
            train_from_data(method, train_feat_cbd, train_label_cbd, np.array([]),
                            save_model, 'CBD')
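
# Usage sketch of split_network (not part of the original code): train fresh models
# from the Denver data and store predictions for every table of an assumed test
# database. All names and argument values below are hypothetical; in particular,
# test_subsets=False and feature_subset=None are guesses at neutral defaults.
train_full, train_suburbs, train_cbds = collect_data("denver", [], train=True)

split_network(train_suburbs, train_cbds,
              test_db="test_city",                      # hypothetical test database
              tables_test=[],                           # empty: predict for every table
              model_suburbs=None, scaler_suburbs=None,  # unused when training data is given
              model_cbds=None, scaler_cbds=None,
              method="RFR",
              save_predictions=True,                    # write predictions back to the database
              labels=True,                              # test tables contain reference heights
              save_model=False,
              test_subsets=False,
              feature_subset=None)
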
def single_network(train_data, test_db, tables_test, model, scaler, method,
                   save_predictions, labels, save_model, test_subsets, feature_subset):
    """
    Perform the machine learning based on a single training network
    that combines the CBDs and Suburban and Rural areas.
    """

    reading_time = []
    predict_time = []

    if len(train_data) != 0:
        train_features, train_labels = ml_funcs.get_features_and_labels(train_data, "single",
                                                                        test_subsets,
                                                                        feature_subset,
                                                                        labels=True)

    # A test database to perform the predictions on has been specified.
    if test_db:
        connection = db_funcs.setup_connection(test_db)
        connection.autocommit = True
        cursor = connection.cursor()

        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            starttime = time()

            _, _, test_data = db_funcs.read_data(connection, table, training=labels)

            endtime = time()
            duration = endtime - starttime
            reading_time.append(duration)

            if labels:
                test_features, test_labels = ml_funcs.get_features_and_labels(test_data, "single",
                                                                              test_subsets,
                                                                              feature_subset,
                                                                              labels=labels)
            else:
                test_features = ml_funcs.get_features_and_labels(test_data, "single", test_subsets,
                                                                 feature_subset, labels=labels)

            starttime = time()

            if len(train_data) == 0:
                if method == "RFR":
                    predictions, importances = predict_from_model(method, test_features,
                                                                  model, scaler, 'combined')
                else:
                    predictions = predict_from_model(method, test_features, model, scaler,
                                                     'combined')
            else:
                if method == "RFR":
                    predictions, importances = train_from_data(method, train_features,
                                                               train_labels, test_features,
                                                               save_model, 'combined')
                else:
                    predictions = train_from_data(method, train_features, train_labels,
                                                  test_features, save_model, 'combined')

            endtime = time()
            duration = endtime - starttime
            predict_time.append(duration)

            # Labels are present: print statistics for the height predictions.
            if labels:
                if method == "RFR":
                    ml_funcs.get_statistics(test_labels, predictions, "single", feature_subset,
                                            importances)
                else:
                    ml_funcs.get_statistics(test_labels, predictions, "single", feature_subset)
                generate_plots.plot_cumulative_errors(test_labels, predictions, 'combined')

            # Store predictions in database.
            if save_predictions:
                height_values = list(zip(test_data.id, predictions))
                db_funcs.store_predictions(cursor, height_values, table, method, 'combined')

        db_funcs.close_connection(connection, cursor)

        print("\n>> Total duration (s) of reading data " + \
              "into dataframes: {0} ({1})".format(sum(reading_time),
                                                  timedelta(seconds=sum(reading_time))))
        print("\n>> Total duration (s) of the building " + \
              "height predictions: {0} ({1})".format(sum(predict_time),
                                                     timedelta(seconds=sum(predict_time))))

    # No test database was specified: only train the model on the training data.
    # Useful when training a model and storing it to a file.
    else:
        if len(train_data) != 0:
            train_from_data(method, train_features, train_labels, np.array([]),
                            save_model, 'combined')
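
# Complementary usage sketch for single_network (not part of the original code):
# predict from a previously saved model and scaler instead of training. The joblib
# files, database and table names are assumptions for illustration only.
import joblib
import numpy as np

model = joblib.load("rfr_combined_model.joblib")
scaler = joblib.load("rfr_combined_scaler.joblib")

single_network(train_data=np.array([]),    # empty: skip training, predict from the loaded model
               test_db="test_city",
               tables_test=["city_centre"],
               model=model, scaler=scaler,
               method="RFR",
               save_predictions=True,
               labels=False,               # no reference heights in the test tables
               save_model=False,
               test_subsets=False,
               feature_subset=None)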