Example #1
def create_admin_bdys_for_analysis(settings):
    # Step 3 of 3 : create admin bdy tables optimised for spatial analysis
    start_time = datetime.now()

    if settings['st_subdivide_supported']:
        template_sql = psma.open_sql_file(
            "02-03-create-admin-bdy-analysis-tables_template.sql", settings)
        sql_list = list()

        for table in settings['admin_bdy_list']:
            sql = template_sql.format(table[0], table[1])
            if table[0] == 'locality_bdys':  # special case, need to change schema name
                # sql = sql.replace(settings['raw_admin_bdys_schema'], settings['admin_bdys_schema'])
                sql = sql.replace("name", "locality_name")
                # add postcodes
                sql = sql.replace(
                    "locality_name text NOT NULL,",
                    "locality_name text NOT NULL, postcode text NULL,")
                sql = sql.replace("locality_name,", "locality_name, postcode,")

            sql_list.append(sql)
        psma.multiprocess_list("sql", sql_list, settings, logger)
        logger.info(
            "\t- Step 3 of 3 : admin boundaries for analysis created : {0}".
            format(datetime.now() - start_time))
    else:
        logger.warning(
            "\t- Step 3 of 3 : admin boundaries for analysis NOT created - "
            "requires PostGIS 2.2+ with GEOS 3.5.0+")
Example #2
def get_split_localities(pg_cur, settings):
    start_time = datetime.now()
    sql = psma.open_sql_file("02-split-localities-by-state-borders.sql", settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['admin_bdys_schema'], "temp_localities", "loc", "gid",
                                        settings, logger)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 2 of 7 : localities split by state : {0}".format(datetime.now() - start_time))
Example #3
def get_locality_state_border_gaps(pg_cur, settings):
    start_time = datetime.now()
    sql = psma.open_sql_file("04-create-holes-along-borders.sql", settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['admin_bdys_schema'],
                                        "temp_state_border_buffers_subdivided", "ste", "new_gid", settings, logger)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 4 of 7 : locality holes created : {0}".format(datetime.now() - start_time))
Example #4
def create_primary_foreign_keys(settings):
    start_time = datetime.now()

    key_sql = open(
        os.path.join(settings['sql_dir'],
                     "01-06-raw-gnaf-create-primary-foreign-keys.sql"),
        "r").read()
    key_sql_list = key_sql.split("--")
    sql_list = []

    for sql in key_sql_list:
        sql = sql.strip()
        if sql[0:6] == "ALTER ":
            # add schema to table names, in case the raw gnaf schema is not the default
            sql = sql.replace(
                "ALTER TABLE ONLY ",
                "ALTER TABLE ONLY " + settings['raw_gnaf_schema'] + ".")
            sql_list.append(sql)

    # run queries in separate processes
    psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info(
        "\t- Step 6 of 7 : primary & foreign keys created : {0}".format(
            datetime.now() - start_time))
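
Every step above finishes by handing a work list to psma.multiprocess_list, whose implementation is not shown in these examples. Below is a hedged sketch of how a "sql" work list might be fanned out with multiprocessing and psycopg2; the settings['pg_connect_string'] and settings['max_concurrent_processes'] keys are assumptions, not confirmed by the code above.

import multiprocessing

import psycopg2


def run_sql_multiprocessing(args):
    # hypothetical sketch: run one statement on its own connection and report the outcome
    sql, settings = args
    pg_conn = psycopg2.connect(settings['pg_connect_string'])  # assumed settings key
    pg_conn.autocommit = True
    pg_cur = pg_conn.cursor()
    try:
        pg_cur.execute(sql)
        result = "SUCCESS"
    except psycopg2.Error as ex:
        result = "SQL FAILED! : {0} : {1}".format(sql, ex)
    finally:
        pg_cur.close()
        pg_conn.close()
    return result


def multiprocess_list(work_type, work_list, settings, logger):
    # hypothetical sketch: fan the work list out over a process pool
    pool = multiprocessing.Pool(processes=settings['max_concurrent_processes'])  # assumed settings key
    if work_type == "sql":
        results = pool.map(run_sql_multiprocessing, [(item, settings) for item in work_list])
    else:
        raise NotImplementedError("only the 'sql' work type is sketched here")
    pool.close()
    pool.join()

    for result in results:
        if result != "SUCCESS":
            logger.warning(result)
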
Example #5
def get_locality_state_border_gaps(pg_cur, settings):
    start_time = datetime.now()
    sql = psma.open_sql_file("04-create-holes-along-borders.sql", settings)
    sql_list = psma.split_sql_into_list(
        pg_cur, sql, settings['admin_bdys_schema'],
        "temp_state_border_buffers_subdivided", "ste", "new_gid", settings,
        logger)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 4 of 7 : locality holes created : {0}".format(
        datetime.now() - start_time))
Example #6
def create_states_and_prep_localities(settings):
    start_time = datetime.now()
    sql_list = [
        psma.open_sql_file("01a-create-states-from-sa4s.sql", settings),
        psma.open_sql_file("01b-prep-locality-boundaries.sql", settings)
    ]
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info(
        "\t- Step 1 of 7 : state table created & localities prepped : {0}".
        format(datetime.now() - start_time))
Example #7
def get_split_localities(pg_cur, settings):
    start_time = datetime.now()
    sql = psma.open_sql_file("02-split-localities-by-state-borders.sql",
                             settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql,
                                        settings['admin_bdys_schema'],
                                        "temp_localities", "loc", "gid",
                                        settings, logger)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 2 of 7 : localities split by state : {0}".format(
        datetime.now() - start_time))
Example #8
def index_raw_gnaf(settings):
    # Step 5 of 7 : create indexes
    start_time = datetime.now()

    raw_sql_list = psma.open_sql_file("01-05-raw-gnaf-create-indexes.sql", settings).split("\n")
    sql_list = []
    for sql in raw_sql_list:
        if sql[0:2] != "--" and sql[0:2] != "":
            sql_list.append(sql)

    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 5 of 7 : indexes created: {0}".format(datetime.now() - start_time))
Example #9
def prep_admin_bdys(pg_cur, settings):
    # Step 2 of 3 : create admin bdy tables ready to be used
    start_time = datetime.now()

    if settings['admin_bdys_schema'] != "public":
        pg_cur.execute(
            "CREATE SCHEMA IF NOT EXISTS {0} AUTHORIZATION {1}".format(
                settings['admin_bdys_schema'], settings['pg_user']))

    # create tables using multiprocessing - using flag in file to split file up into sets of statements
    sql_list = psma.open_sql_file("02-02a-prep-admin-bdys-tables.sql",
                                  settings).split("-- # --")
    sql_list = sql_list + psma.open_sql_file(
        "02-02b-prep-census-2011-bdys-tables.sql", settings).split("-- # --")
    sql_list = sql_list + psma.open_sql_file(
        "02-02c-prep-census-2016-bdys-tables.sql", settings).split("-- # --")

    # # Account for bdys that are not in states to load - not yet working
    # for sql in sql_list:
    #     if settings['states_to_load'] == ['OT'] and '.commonwealth_electorates ' in sql:
    #         sql_list.remove(sql)
    #
    #     if settings['states_to_load'] == ['ACT'] and '.local_government_areas ' in sql:
    #         sql_list.remove(sql)
    #
    #     logger.info(settings['states_to_load']
    #
    #     if not ('NT' in settings['states_to_load'] or 'SA' in settings['states_to_load']
    #             or 'VIC' in settings['states_to_load'] or 'WA' in settings['states_to_load']) \
    #             and '.local_government_wards ' in sql:
    #         sql_list.remove(sql)
    #
    #     if settings['states_to_load'] == ['OT'] and '.state_lower_house_electorates ' in sql:
    #         sql_list.remove(sql)
    #
    #     if not ('TAS' in settings['states_to_load'] or 'VIC' in settings['states_to_load']
    #             or 'WA' in settings['states_to_load']) and '.state_upper_house_electorates ' in sql:
    #         sql_list.remove(sql)

    psma.multiprocess_list("sql", sql_list, settings, logger)

    # Special case - remove custom outback bdy if South Australia not requested
    if 'SA' not in settings['states_to_load']:
        pg_cur.execute(
            psma.prep_sql(
                "DELETE FROM admin_bdys.locality_bdys WHERE locality_pid = 'SA999999'",
                settings))
        pg_cur.execute(
            psma.prep_sql("VACUUM ANALYZE admin_bdys.locality_bdys", settings))

    logger.info("\t- Step 2 of 3 : admin boundaries prepped : {0}".format(
        datetime.now() - start_time))
Example #10
def analyse_raw_gnaf_tables(pg_cur, settings):
    start_time = datetime.now()
    
    # get list of tables that haven't been analysed (i.e. that have no real row count)
    sql = "SELECT nspname|| '.' || relname AS table_name " \
          "FROM pg_class C LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)" \
          "WHERE nspname = '{0}' AND relkind='r' AND reltuples = 0".format(settings['raw_gnaf_schema'])
    pg_cur.execute(sql)

    sql_list = []

    for pg_row in pg_cur:
        sql_list.append("ANALYZE {0}".format(pg_row[0]))

    # run queries in separate processes
    psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info("\t- Step 7 of 7 : tables analysed : {0}".format(datetime.now() - start_time))
Example #11
def populate_raw_gnaf(settings):
    # Step 4 of 7 : load raw gnaf authority & state tables
    start_time = datetime.now()

    # authority code file list
    sql_list = get_raw_gnaf_files("authority_code", settings)

    # add state file lists
    for state in settings['states_to_load']:
        logger.info("\t\t- Loading state {}".format(state))
        sql_list.extend(get_raw_gnaf_files(state, settings))

    # are there any files to load?
    if len(sql_list) == 0:
        logger.fatal("No raw GNAF PSV files found\nACTION: Check your 'gnaf_network_directory' path")
        logger.fatal("\t- Step 4 of 7 : table populate FAILED!")
    else:
        # load all PSV files using multiprocessing
        psma.multiprocess_list("sql", sql_list, settings, logger)
        logger.info("\t- Step 4 of 7 : tables populated : {0}".format(datetime.now() - start_time))
Example #12
def boundary_tag_gnaf(pg_cur, settings):

    # create bdy table list
    # remove localities, postcodes and states as these IDs are already assigned to GNAF addresses
    table_list = list()
    for table in settings['admin_bdy_list']:
        if table[0] not in ["locality_bdys", "postcode_bdys", "state_bdys"]:
            # if no analysis tables created - use the full tables instead of the subdivided ones
            # WARNING: this can add hours to the processing
            if settings['st_subdivide_supported']:
                table_name = "{}_analysis".format(table[0], )
            else:
                table_name = table[0]

            table_list.append([table_name, table[1]])

    # create bdy tagged address tables
    for address_table in ["address_principal", "address_alias"]:
        pg_cur.execute(
            "DROP TABLE IF EXISTS {}.{}_admin_boundaries CASCADE".format(
                settings['gnaf_schema'], address_table))
        create_table_list = list()
        create_table_list.append(
            "CREATE TABLE {}.{}_admin_boundaries (gid serial NOT NULL,"
            "gnaf_pid text NOT NULL,"
            # "alias_principal character(1) NOT NULL,"
            "locality_pid text NOT NULL,"
            "locality_name text NOT NULL,"
            "postcode text,"
            "state text NOT NULL".format(settings['gnaf_schema'],
                                         address_table))

        for table in table_list:
            pid_field = table[1]
            name_field = pid_field.replace("_pid", "_name")
            create_table_list.append(", {} text, {} text".format(
                pid_field, name_field))
        create_table_list.append(
            ") WITH (OIDS=FALSE);ALTER TABLE {}.{}_admin_boundaries OWNER TO {}"
            .format(settings['gnaf_schema'], address_table,
                    settings['pg_user']))
        pg_cur.execute("".join(create_table_list))

    # Step 1 of 6 : tag gnaf addresses with admin boundary IDs, using multiprocessing
    start_time = datetime.now()

    # create temp tables
    template_sql = psma.open_sql_file(
        "04-01a-bdy-tag-create-table-template.sql", settings)
    for table in table_list:
        pg_cur.execute(template_sql.format(table[0], ))

    # create temp tables of bdy tagged gnaf_pids
    template_sql = psma.open_sql_file("04-01b-bdy-tag-template.sql", settings)
    sql_list = list()
    for table in table_list:
        sql = template_sql.format(table[0], table[1])

        short_sql_list = psma.split_sql_into_list(
            pg_cur, sql, settings['admin_bdys_schema'], table[0], "bdys",
            "gid", settings, logger)

        if short_sql_list is not None:
            sql_list.extend(short_sql_list)

    # logger.info('\n'.join(sql_list))

    if sql_list is not None:
        psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info(
        "\t- Step 1 of 6 : principal addresses tagged with admin boundary IDs: {}"
        .format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 2 of 6 : delete invalid matches, create indexes and analyse tables
    sql_list = list()
    for table in table_list:
        sql = "DELETE FROM {0}.temp_{1}_tags WHERE gnaf_state <> bdy_state AND gnaf_state <> 'OT';" \
              "CREATE INDEX temp_{1}_tags_gnaf_pid_idx ON {0}.temp_{1}_tags USING btree(gnaf_pid);" \
              "ANALYZE {0}.temp_{1}_tags".format(settings['gnaf_schema'], table[0])
        sql_list.append(sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info(
        "\t- Step 2 of 6 : principal addresses - invalid matches deleted & bdy tag indexes created : {}"
        .format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 3 of 6 : insert boundary tagged addresses

    # create insert statement for multiprocessing
    insert_field_list = list()
    insert_field_list.append(
        "(gnaf_pid, locality_pid, locality_name, postcode, state")

    insert_join_list = list()
    insert_join_list.append("FROM {}.address_principals AS pnts ".format(
        settings['gnaf_schema'], ))

    select_field_list = list()
    select_field_list.append("SELECT pnts.gnaf_pid, pnts.locality_pid, "
                             "pnts.locality_name, pnts.postcode, pnts.state")

    drop_table_list = list()

    for table in table_list:
        pid_field = table[1]
        name_field = pid_field.replace("_pid", "_name")
        insert_field_list.append(", {0}, {1}".format(pid_field, name_field))
        select_field_list.append(
            ", temp_{0}_tags.bdy_pid, temp_{0}_tags.bdy_name ".format(
                table[0]))
        insert_join_list.append(
            "LEFT OUTER JOIN {0}.temp_{1}_tags ON pnts.gnaf_pid = temp_{1}_tags.gnaf_pid "
            .format(settings['gnaf_schema'], table[0]))
        drop_table_list.append(
            "DROP TABLE IF EXISTS {0}.temp_{1}_tags;".format(
                settings['gnaf_schema'], table[0]))

    insert_field_list.append(") ")

    insert_statement_list = list()
    insert_statement_list.append(
        "INSERT INTO {0}.address_principal_admin_boundaries ".format(
            settings['gnaf_schema'], ))
    insert_statement_list.append("".join(insert_field_list))
    insert_statement_list.append("".join(select_field_list))
    insert_statement_list.append("".join(insert_join_list))

    sql = "".join(insert_statement_list) + ";"
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "address_principals", "pnts", "gid",
                                        settings, logger)
    # logger.info("\n".join(sql_list)

    if sql_list is not None:
        psma.multiprocess_list("sql", sql_list, settings, logger)

    # drop temp tables
    pg_cur.execute("".join(drop_table_list))

    # get stats
    pg_cur.execute("ANALYZE {0}.address_principal_admin_boundaries ".format(
        settings['gnaf_schema']))

    logger.info(
        "\t- Step 3 of 6 : principal addresses - bdy tags added to output table : {}"
        .format(datetime.now() - start_time, ))

    start_time = datetime.now()

    # Step 4 of 6 : add index to output table
    sql = "CREATE INDEX address_principal_admin_boundaries_gnaf_pid_idx " \
          "ON {0}.address_principal_admin_boundaries USING btree (gnaf_pid)"\
        .format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    logger.info(
        "\t- Step 4 of 6 : created index on bdy tagged address table : {}".
        format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 5 of 6 : log duplicates - happens when 2 boundaries overlap by a very small amount
    # (can be ignored if there's a small number of records affected)
    sql = "SELECT gnaf_pid FROM (SELECT Count(*) AS cnt, gnaf_pid FROM {0}.address_principal_admin_boundaries " \
          "GROUP BY gnaf_pid) AS sqt WHERE cnt > 1".format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    # get cursor description to test if any rows returned safely
    columns = pg_cur.description

    # log gnaf_pids that got duplicate results
    if columns is not None:
        duplicates = pg_cur.fetchall()
        gnaf_pids = list()

        for duplicate in duplicates:
            gnaf_pids.append("\t\t" + duplicate[0])

        if len(gnaf_pids) > 0:
            logger.warning(
                "\t- Step 5 of 6 : found boundary tag duplicates : {}".format(
                    datetime.now() - start_time, ))
            logger.warning("\n".join(gnaf_pids))
        else:
            logger.info(
                "\t- Step 5 of 6 : no boundary tag duplicates : {}".format(
                    datetime.now() - start_time, ))
    else:
        logger.info("\t- Step 5 of 6 : no boundary tag duplicates : {}".format(
            datetime.now() - start_time, ))

    start_time = datetime.now()

    # Step 6 of 6 : Copy principal boundary tags to alias addresses
    pg_cur.execute(
        psma.open_sql_file("04-06-bdy-tags-for-alias-addresses.sql", settings))
    logger.info(
        "\t- Step 6 of 6 : alias addresses boundary tagged : {}".format(
            datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 7 of 7 : Create view of all bdy tags
    pg_cur.execute(
        psma.open_sql_file("04-07-create-bdy-tag-view.sql", settings))
    logger.info(
        "\t- Step 7 of 7 : boundary tagged address view created : {}".format(
            datetime.now() - start_time, ))
Example #13
def create_reference_tables(pg_cur, settings):
    # set postgres search path back to the default
    pg_cur.execute("SET search_path = public, pg_catalog")

    # Step 1 of 14 : create reference tables
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-01-reference-create-tables.sql", settings))
    logger.info("\t- Step  1 of 14 : create reference tables : {0}".format(
        datetime.now() - start_time))

    # Step 2 of 14 : populate localities
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-02-reference-populate-localities.sql",
                           settings))
    logger.info("\t- Step  2 of 14 : localities populated : {0}".format(
        datetime.now() - start_time))

    # Step 3 of 14 : populate locality aliases
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-03-reference-populate-locality-aliases.sql",
                           settings))
    logger.info("\t- Step  3 of 14 : locality aliases populated : {0}".format(
        datetime.now() - start_time))

    # Step 4 of 14 : populate locality neighbours
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-04-reference-populate-locality-neighbours.sql",
                           settings))
    logger.info(
        "\t- Step  4 of 14 : locality neighbours populated : {0}".format(
            datetime.now() - start_time))

    # Step 5 of 14 : populate streets
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-05-reference-populate-streets.sql", settings))
    logger.info(
        "\t- Step  5 of 14 : streets populated : {0}".format(datetime.now() -
                                                             start_time))

    # Step 6 of 14 : populate street aliases
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-06-reference-populate-street-aliases.sql",
                           settings))
    logger.info("\t- Step  6 of 14 : street aliases populated : {0}".format(
        datetime.now() - start_time))

    # Step 7 of 14 : populate addresses, using multiprocessing
    start_time = datetime.now()
    sql = psma.open_sql_file("03-07-reference-populate-addresses-1.sql",
                             settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "streets", "str", "gid", settings,
                                        logger)
    if sql_list is not None:
        psma.multiprocess_list('sql', sql_list, settings, logger)
    pg_cur.execute(psma.prep_sql("ANALYZE gnaf.temp_addresses;", settings))
    logger.info(
        "\t- Step  7 of 14 : addresses populated : {0}".format(datetime.now() -
                                                               start_time))

    # Step 8 of 14 : populate principal alias lookup
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-08-reference-populate-address-alias-lookup.sql",
                           settings))
    logger.info(
        "\t- Step  8 of 14 : principal alias lookup populated : {0}".format(
            datetime.now() - start_time))

    # Step 9 of 14 : populate primary secondary lookup
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file(
            "03-09-reference-populate-address-secondary-lookup.sql", settings))
    pg_cur.execute(
        psma.prep_sql("VACUUM ANALYSE gnaf.address_secondary_lookup",
                      settings))
    logger.info(
        "\t- Step  9 of 14 : primary secondary lookup populated : {0}".format(
            datetime.now() - start_time))

    # Step 10 of 14 : split the Melbourne locality into its 2 postcodes (3000, 3004)
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-10-reference-split-melbourne.sql", settings))
    logger.info(
        "\t- Step 10 of 14 : Melbourne split : {0}".format(datetime.now() -
                                                           start_time))

    # Step 11 of 14 : finalise localities assigned to streets and addresses
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-11-reference-finalise-localities.sql",
                           settings))
    logger.info("\t- Step 11 of 14 : localities finalised : {0}".format(
        datetime.now() - start_time))

    # Step 12 of 14 : finalise addresses, using multiprocessing
    start_time = datetime.now()
    sql = psma.open_sql_file("03-12-reference-populate-addresses-2.sql",
                             settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "localities", "loc", "gid", settings,
                                        logger)
    if sql_list is not None:
        psma.multiprocess_list('sql', sql_list, settings, logger)

    # turf the temp address table
    pg_cur.execute(
        psma.prep_sql("DROP TABLE IF EXISTS gnaf.temp_addresses", settings))
    logger.info(
        "\t- Step 12 of 14 : addresses finalised : {0}".format(datetime.now() -
                                                               start_time))

    # Step 13 of 14 : create almost correct postcode boundaries by aggregating localities, using multiprocessing
    start_time = datetime.now()
    sql = psma.open_sql_file("03-13-reference-derived-postcode-bdys.sql",
                             settings)
    sql_list = []
    for state in settings['states_to_load']:
        state_sql = sql.replace("GROUP BY ",
                                "WHERE state = '{0}' GROUP BY ".format(state))
        sql_list.append(state_sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)

    # create analysis table?
    if settings['st_subdivide_supported']:
        pg_cur.execute(
            psma.open_sql_file("03-13a-create-postcode-analysis-table.sql",
                               settings))

    logger.info("\t- Step 13 of 14 : postcode boundaries created : {0}".format(
        datetime.now() - start_time))

    # Step 14 of 14 : create indexes, primary and foreign keys, using multiprocessing
    start_time = datetime.now()
    raw_sql_list = psma.open_sql_file("03-14-reference-create-indexes.sql",
                                      settings).split("\n")
    sql_list = []
    for sql in raw_sql_list:
        if sql[0:2] != "--" and sql[0:2] != "":
            sql_list.append(sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info(
        "\t- Step 14 of 14 : create primary & foreign keys and indexes : {0}".
        format(datetime.now() - start_time))
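
The per-state trick in Step 13 works by textually injecting a WHERE clause just before the GROUP BY of the postcode-aggregation statement, yielding one independent statement per state for the multiprocessing pool. A toy illustration of the rewrite follows; the table and column names are hypothetical, not the project's actual SQL.

# toy illustration of the Step 13 per-state rewrite; names below are made up
base_sql = ("INSERT INTO gnaf.postcode_bdys "
            "SELECT postcode, state, ST_Union(geom) AS geom FROM gnaf.localities "
            "GROUP BY postcode, state")

states_to_load = ["NSW", "VIC", "QLD"]  # example values only
sql_list = [base_sql.replace("GROUP BY ", "WHERE state = '{0}' GROUP BY ".format(state))
            for state in states_to_load]
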
Example #14
def create_states_and_prep_localities(settings):
    start_time = datetime.now()
    sql_list = [psma.open_sql_file("01a-create-states-from-sa4s.sql", settings),
                psma.open_sql_file("01b-prep-locality-boundaries.sql", settings)]
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info("\t- Step 1 of 7 : state table created & localities prepped : {0}".format(datetime.now() - start_time))
Example #15
def boundary_tag_gnaf(pg_cur, settings):

    # create bdy table list
    # remove localities, postcodes and states as these IDs are already assigned to GNAF addresses
    table_list = list()
    for table in settings['admin_bdy_list']:
        if table[0] not in ["locality_bdys", "postcode_bdys", "state_bdys"]:
            # if no analysis tables created - use the full tables instead of the subdivided ones
            # WARNING: this can add hours to the processing
            if settings['st_subdivide_supported']:
                table_name = "{0}_analysis".format(table[0], )
            else:
                table_name = table[0]

            table_list.append([table_name, table[1]])

    # create bdy tagged address table
    pg_cur.execute(
        "DROP TABLE IF EXISTS {0}.address_admin_boundaries CASCADE".format(
            settings['gnaf_schema'], ))
    create_table_list = list()
    create_table_list.append(
        "CREATE TABLE {0}.address_admin_boundaries (gid serial NOT NULL,"
        "gnaf_pid character varying(16) NOT NULL,"
        "alias_principal character(1) NOT NULL,"
        "locality_pid character varying(16) NOT NULL,"
        "locality_name character varying(100) NOT NULL,"
        "postcode character varying(4),"
        "state character varying(3) NOT NULL".format(
            settings['gnaf_schema'], ))
    for table in table_list:
        pid_field = table[1]
        name_field = pid_field.replace("_pid", "_name")
        create_table_list.append(
            ", {0} character varying(15), {1} character varying(100)".format(
                pid_field, name_field))
    create_table_list.append(
        ") WITH (OIDS=FALSE);ALTER TABLE {0}.address_admin_boundaries OWNER TO {1}"
        .format(settings['gnaf_schema'], settings['pg_user']))
    pg_cur.execute("".join(create_table_list))

    i = 0

    for address_table in ["address_principals", "address_aliases"]:

        # Step 1/4 of 8 : tag gnaf addresses with admin boundary IDs, using multiprocessing
        start_time = datetime.now()

        # create temp tables
        template_sql = psma.open_sql_file(
            "04-01a-bdy-tag-create-table-template.sql", settings)
        for table in table_list:
            pg_cur.execute(template_sql.format(table[0], ))

        # create temp tables of bdy tagged gnaf_pids
        template_sql = psma.open_sql_file("04-01b-bdy-tag-template.sql",
                                          settings)
        sql_list = list()
        for table in table_list:
            sql = template_sql.format(table[0], table[1])

            short_sql_list = psma.split_sql_into_list(
                pg_cur, sql, settings['admin_bdys_schema'], table[0], "bdys",
                "gid", settings, logger)

            if short_sql_list is not None:
                sql_list.extend(short_sql_list)

        # logger.info('\n'.join(sql_list))

        if sql_list is not None:
            psma.multiprocess_list("sql", sql_list, settings, logger)

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - gnaf addresses tagged with admin boundary IDs: {2}"
            .format(i, address_table,
                    datetime.now() - start_time))
        start_time = datetime.now()

        # Step 2/5 of 8 : delete invalid matches, create indexes and analyse tables
        sql_list = list()
        for table in table_list:
            sql = "DELETE FROM {0}.temp_{1}_tags WHERE gnaf_state <> bdy_state AND gnaf_state <> 'OT';" \
                  "CREATE INDEX temp_{1}_tags_gnaf_pid_idx ON {0}.temp_{1}_tags USING btree(gnaf_pid);" \
                  "ANALYZE {0}.temp_{1}_tags".format(settings['gnaf_schema'], table[0])
            sql_list.append(sql)
        psma.multiprocess_list("sql", sql_list, settings, logger)

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - invalid matches deleted & bdy tag indexes created : {2}"
            .format(i, address_table,
                    datetime.now() - start_time))
        start_time = datetime.now()

        # Step 3/6 of 8 : insert boundary tagged addresses

        # create insert statement for multiprocessing
        insert_field_list = list()
        insert_field_list.append(
            "(gnaf_pid, alias_principal, locality_pid, locality_name, postcode, state"
        )

        insert_join_list = list()
        insert_join_list.append("FROM {0}.{1} AS pnts ".format(
            settings['gnaf_schema'], address_table))

        select_field_list = list()
        select_field_list.append(
            "SELECT pnts.gnaf_pid, pnts.alias_principal, pnts.locality_pid, "
            "pnts.locality_name, pnts.postcode, pnts.state")

        drop_table_list = list()

        for table in table_list:
            pid_field = table[1]
            name_field = pid_field.replace("_pid", "_name")
            insert_field_list.append(", {0}, {1}".format(
                pid_field, name_field))
            select_field_list.append(
                ", temp_{0}_tags.bdy_pid, temp_{0}_tags.bdy_name ".format(
                    table[0]))
            insert_join_list.append(
                "LEFT OUTER JOIN {0}.temp_{1}_tags ON pnts.gnaf_pid = temp_{1}_tags.gnaf_pid "
                .format(settings['gnaf_schema'], table[0]))
            drop_table_list.append(
                "DROP TABLE IF EXISTS {0}.temp_{1}_tags;".format(
                    settings['gnaf_schema'], table[0]))

        insert_field_list.append(") ")

        insert_statement_list = list()
        insert_statement_list.append(
            "INSERT INTO {0}.address_admin_boundaries ".format(
                settings['gnaf_schema'], ))
        insert_statement_list.append("".join(insert_field_list))
        insert_statement_list.append("".join(select_field_list))
        insert_statement_list.append("".join(insert_join_list))

        sql = "".join(insert_statement_list) + ";"
        sql_list = psma.split_sql_into_list(pg_cur, sql,
                                            settings['gnaf_schema'],
                                            address_table, "pnts", "gid",
                                            settings, logger)
        # logger.info("\n".join(sql_list)

        if sql_list is not None:
            psma.multiprocess_list("sql", sql_list, settings, logger)

        # drop temp tables
        pg_cur.execute("".join(drop_table_list))

        # get stats
        pg_cur.execute("ANALYZE {0}.address_admin_boundaries ".format(
            settings['gnaf_schema']))

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - bdy tags added to output table : {2}".
            format(i, address_table,
                   datetime.now() - start_time))

    start_time = datetime.now()

    # Step 7 of 8 : add index to output table
    sql = "CREATE INDEX address_admin_boundaries_gnaf_pid_idx ON {0}.address_admin_boundaries USING btree (gnaf_pid)"\
        .format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    i += 1
    logger.info(
        "\t- Step {0} of 8 : created index on bdy tagged address table : {1}".
        format(i,
               datetime.now() - start_time))
    start_time = datetime.now()

    # Step 8 of 8 : log duplicates - happens when 2 boundaries overlap by a very small amount
    # (can be ignored if there's a small number of records affected)
    sql = "SELECT gnaf_pid FROM (SELECT Count(*) AS cnt, gnaf_pid FROM {0}.address_admin_boundaries " \
          "GROUP BY gnaf_pid) AS sqt WHERE cnt > 1".format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    i += 1

    duplicates = pg_cur.fetchall()
    gnaf_pids = list()

    for duplicate in duplicates:
        gnaf_pids.append("\t\t" + duplicate[0])

    if len(gnaf_pids) > 0:
        logger.warning(
            "\t- Step {0} of 8 : found boundary tag duplicates : {1}".format(
                i,
                datetime.now() - start_time))
        logger.warning("\n".join(gnaf_pids))
    else:
        logger.info(
            "\t- Step {0} of 8 : no boundary tag duplicates : {1}".format(
                i,
                datetime.now() - start_time))
Example #16
def load_raw_admin_boundaries(pg_cur, settings):
    start_time = datetime.now()

    # drop existing views
    pg_cur.execute(
        psma.open_sql_file("02-01-drop-admin-bdy-views.sql", settings))

    # add locality class authority code table
    settings['states_to_load'].extend(["authority_code"])

    # create schema
    if settings['raw_admin_bdys_schema'] != "public":
        pg_cur.execute(
            "CREATE SCHEMA IF NOT EXISTS {0} AUTHORIZATION {1}".format(
                settings['raw_admin_bdys_schema'], settings['pg_user']))

    # set psql connect string and password
    psql_str = "psql -U {0} -d {1} -h {2} -p {3}"\
        .format(settings['pg_user'], settings['pg_db'], settings['pg_host'], settings['pg_port'])

    password_str = ''
    if not os.getenv("PGPASSWORD"):
        if platform.system() == "Windows":
            password_str = "SET"
        else:
            password_str = "export"

        password_str += " PGPASSWORD={0}&&".format(settings['pg_password'])

    # get file list
    table_list = []
    cmd_list1 = []
    cmd_list2 = []

    for state in settings['states_to_load']:
        state = state.lower()
        # get a dictionary of Shapefiles and DBFs matching the state
        for root, dirs, files in os.walk(
                settings['admin_bdys_local_directory']):
            for file_name in files:
                if file_name.lower().startswith(state + "_"):
                    if file_name.lower().endswith("_shp.dbf"):
                        # change file type for spatial files
                        if file_name.lower().endswith("_polygon_shp.dbf"):
                            spatial = True
                            bdy_file = os.path.join(
                                root, file_name.replace(".dbf", ".shp"))
                        else:
                            spatial = False
                            bdy_file = os.path.join(root, file_name)

                        bdy_table = file_name.lower().replace(
                            state + "_", "aus_", 1).replace("_shp.dbf", "")

                        # set command line parameters depending on whether this is the 1st state (for creating tables)
                        table_list_add = False

                        if bdy_table not in table_list:
                            table_list_add = True

                            if spatial:
                                params = "-d -D -s 4283 -i"
                            else:
                                params = "-d -D -G -n -i"
                        else:
                            if spatial:
                                params = "-a -D -s 4283 -i"
                            else:
                                params = "-a -D -G -n -i"

                        cmd = "{0}shp2pgsql {1} \"{2}\" {3}.{4} | {5}".format(
                            password_str, params, bdy_file,
                            settings['raw_admin_bdys_schema'], bdy_table,
                            psql_str)

                        # skip locality files in the Town Points folder - they're duplicates
                        if "town points" not in bdy_file.lower():
                            if table_list_add:
                                table_list.append(bdy_table)
                                cmd_list1.append(cmd)
                            else:
                                cmd_list2.append(cmd)
                        else:
                            if not bdy_file.lower().endswith(
                                    "_locality_shp.dbf"):
                                if table_list_add:
                                    table_list.append(bdy_table)
                                    cmd_list1.append(cmd)
                                else:
                                    cmd_list2.append(cmd)

    # logger.info('\n'.join(cmd_list1)
    # logger.info('\n'.join(cmd_list2)

    # are there any files to load?
    if len(cmd_list1) == 0:
        logger.fatal(
            "No Admin Boundary files found\nACTION: Check your 'admin-bdys-path' argument"
        )
    else:
        # load files in separate processes -
        # do the commands that create the tables first before attempting the subsequent insert commands
        psma.multiprocess_list("cmd", cmd_list1, settings, logger)
        psma.multiprocess_list("cmd", cmd_list2, settings, logger)
        logger.info(
            "\t- Step 1 of 3 : raw admin boundaries loaded : {0}".format(
                datetime.now() - start_time))
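
For "cmd" work lists like the shp2pgsql pipelines above, the same fan-out idea applies, just with a shell call instead of a database cursor. A sketch of that branch, assuming the commands are safe to hand to the shell as built (they embed a psql pipe, so shell=True is needed) and that a settings['max_concurrent_processes'] key exists; both are assumptions.

import multiprocessing
import subprocess


def run_command_line(cmd):
    # hypothetical sketch: run one shp2pgsql | psql pipeline and report failures
    try:
        subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
        return "SUCCESS"
    except subprocess.CalledProcessError as ex:
        return "COMMAND FAILED! : {0} : {1}".format(cmd, ex.output)


def multiprocess_command_list(cmd_list, settings, logger):
    # hypothetical sketch: fan the command list out over a process pool
    pool = multiprocessing.Pool(processes=settings['max_concurrent_processes'])  # assumed settings key
    results = pool.map(run_command_line, cmd_list)
    pool.close()
    pool.join()

    for result in results:
        if result != "SUCCESS":
            logger.warning(result)
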