def create_admin_bdys_for_analysis(settings):
    """Step 3 of 3: build admin boundary tables optimised for spatial analysis.

    Skipped (with a warning) when the PostGIS install lacks ST_Subdivide
    support (PostGIS 2.2+ with GEOS 3.5.0+).
    """
    step_start = datetime.now()

    if not settings['st_subdivide_supported']:
        logger.warning(
            "\t- Step 3 of 3 : admin boundaries for analysis NOT created - "
            "requires PostGIS 2.2+ with GEOS 3.5.0+")
        return

    template_sql = psma.open_sql_file(
        "02-03-create-admin-bdy-analysis-tables_template.sql", settings)

    statements = []
    for bdy in settings['admin_bdy_list']:
        statement = template_sql.format(bdy[0], bdy[1])

        # special case: locality boundaries need prefixed column names
        # plus a postcode column
        if bdy[0] == 'locality_bdys':
            statement = statement.replace("name", "locality_name")
            statement = statement.replace(
                "locality_name text NOT NULL,",
                "locality_name text NOT NULL, postcode text NULL,")
            statement = statement.replace("locality_name,",
                                          "locality_name, postcode,")

        statements.append(statement)

    psma.multiprocess_list("sql", statements, settings, logger)
    logger.info("\t- Step 3 of 3 : admin boundaries for analysis created : {0}"
                .format(datetime.now() - step_start))
def get_split_localities(pg_cur, settings):
    """Step 2 of 7: split locality boundaries along state borders, in parallel."""
    step_start = datetime.now()

    raw_sql = psma.open_sql_file("02-split-localities-by-state-borders.sql", settings)
    split_jobs = psma.split_sql_into_list(
        pg_cur, raw_sql, settings['admin_bdys_schema'],
        "temp_localities", "loc", "gid", settings, logger)
    psma.multiprocess_list("sql", split_jobs, settings, logger)

    logger.info("\t- Step 2 of 7 : localities split by state : {0}".format(
        datetime.now() - step_start))
def get_locality_state_border_gaps(pg_cur, settings):
    """Step 4 of 7: create hole-filling geometries along state borders, in parallel."""
    step_start = datetime.now()

    raw_sql = psma.open_sql_file("04-create-holes-along-borders.sql", settings)
    hole_jobs = psma.split_sql_into_list(
        pg_cur, raw_sql, settings['admin_bdys_schema'],
        "temp_state_border_buffers_subdivided", "ste", "new_gid",
        settings, logger)
    psma.multiprocess_list("sql", hole_jobs, settings, logger)

    logger.info("\t- Step 4 of 7 : locality holes created : {0}".format(
        datetime.now() - step_start))
def create_primary_foreign_keys(settings):
    """Step 6 of 7: create primary & foreign keys on the raw GNAF tables.

    Reads the key-creation script, qualifies each ALTER TABLE statement with
    the raw GNAF schema (in case it isn't the default search path), then runs
    all statements in parallel worker processes.
    """
    start_time = datetime.now()

    # read the whole script; context manager ensures the file handle is closed
    key_sql_path = os.path.join(
        settings['sql_dir'], "01-06-raw-gnaf-create-primary-foreign-keys.sql")
    with open(key_sql_path, "r") as key_sql_file:
        key_sql = key_sql_file.read()

    # the script is split on its comment markers; only ALTER statements matter
    sql_list = []
    for sql in key_sql.split("--"):
        sql = sql.strip()
        if sql[0:6] == "ALTER ":
            # add schema to table names, in case raw gnaf schema isn't the default
            sql = sql.replace(
                "ALTER TABLE ONLY ",
                "ALTER TABLE ONLY " + settings['raw_gnaf_schema'] + ".")
            sql_list.append(sql)

    # BUG FIX: sql_list was previously reset to [] at this point, which meant
    # no key statements were ever executed - the reset has been removed

    # run queries in separate processes
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info(
        "\t- Step 6 of 7 : primary & foreign keys created : {0}".format(
            datetime.now() - start_time))
def get_locality_state_border_gaps(pg_cur, settings):
    """Step 4 of 7: fill gaps (holes) along state borders using worker processes.

    NOTE(review): this function is defined more than once in this file; the
    later definition wins at import time - confirm which copy is intended.
    """
    timer = datetime.now()

    script = psma.open_sql_file("04-create-holes-along-borders.sql", settings)
    jobs = psma.split_sql_into_list(
        pg_cur,
        script,
        settings['admin_bdys_schema'],
        "temp_state_border_buffers_subdivided",
        "ste",
        "new_gid",
        settings,
        logger)
    psma.multiprocess_list("sql", jobs, settings, logger)

    elapsed = datetime.now() - timer
    logger.info("\t- Step 4 of 7 : locality holes created : {0}".format(elapsed))
def create_states_and_prep_localities(settings):
    """Step 1 of 7: create the state table from SA4s and prep locality boundaries."""
    step_start = datetime.now()

    script_names = ("01a-create-states-from-sa4s.sql",
                    "01b-prep-locality-boundaries.sql")
    jobs = [psma.open_sql_file(name, settings) for name in script_names]
    psma.multiprocess_list("sql", jobs, settings, logger)

    logger.info(
        "\t- Step 1 of 7 : state table created & localities prepped : {0}"
        .format(datetime.now() - step_start))
def get_split_localities(pg_cur, settings):
    """Step 2 of 7: split localities by state borders via parallel SQL chunks.

    NOTE(review): this function is defined more than once in this file; the
    later definition wins at import time - confirm which copy is intended.
    """
    timer = datetime.now()

    script = psma.open_sql_file("02-split-localities-by-state-borders.sql",
                                settings)
    jobs = psma.split_sql_into_list(
        pg_cur,
        script,
        settings['admin_bdys_schema'],
        "temp_localities",
        "loc",
        "gid",
        settings,
        logger)
    psma.multiprocess_list("sql", jobs, settings, logger)

    elapsed = datetime.now() - timer
    logger.info("\t- Step 2 of 7 : localities split by state : {0}".format(elapsed))
def index_raw_gnaf(settings):
    """Step 5 of 7: create indexes on the raw GNAF tables, one statement per process."""
    step_start = datetime.now()

    script = psma.open_sql_file("01-05-raw-gnaf-create-indexes.sql", settings)
    # one statement per line; drop blank lines and "--" comment lines
    index_jobs = [line for line in script.split("\n")
                  if line[0:2] not in ("--", "")]
    psma.multiprocess_list("sql", index_jobs, settings, logger)

    logger.info("\t- Step 5 of 7 : indexes created: {0}".format(
        datetime.now() - step_start))
def prep_admin_bdys(pg_cur, settings):
    """Step 2 of 3: create the admin boundary tables ready to be used."""
    step_start = datetime.now()

    # make sure the target schema exists ("public" always does)
    if settings['admin_bdys_schema'] != "public":
        pg_cur.execute(
            "CREATE SCHEMA IF NOT EXISTS {0} AUTHORIZATION {1}".format(
                settings['admin_bdys_schema'], settings['pg_user']))

    # create tables using multiprocessing - each script contains a "-- # --"
    # flag marking where it can be split into independent sets of statements
    prep_scripts = (
        "02-02a-prep-admin-bdys-tables.sql",
        "02-02b-prep-census-2011-bdys-tables.sql",
        "02-02c-prep-census-2016-bdys-tables.sql",
    )
    sql_list = []
    for script_name in prep_scripts:
        sql_list += psma.open_sql_file(script_name, settings).split("-- # --")

    # TODO: skip boundary types that don't exist for the requested states
    # (e.g. commonwealth electorates when only OT is loaded) - an earlier
    # attempt at this filtering was disabled as not yet working

    psma.multiprocess_list("sql", sql_list, settings, logger)

    # special case - remove the custom outback boundary if South Australia
    # was not requested
    if 'SA' not in settings['states_to_load']:
        pg_cur.execute(psma.prep_sql(
            "DELETE FROM admin_bdys.locality_bdys WHERE locality_pid = 'SA999999'",
            settings))
        pg_cur.execute(psma.prep_sql(
            "VACUUM ANALYZE admin_bdys.locality_bdys", settings))

    logger.info("\t- Step 2 of 3 : admin boundaries prepped : {0}".format(
        datetime.now() - step_start))
def analyse_raw_gnaf_tables(pg_cur, settings):
    """Step 7 of 7: ANALYZE raw GNAF tables that have no row-count stats yet."""
    step_start = datetime.now()

    # find tables that haven't been analysed (i.e. that have no real row count)
    pg_cur.execute(
        "SELECT nspname|| '.' || relname AS table_name "
        "FROM pg_class C LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)"
        "WHERE nspname = '{0}' AND relkind='r' AND reltuples = 0".format(
            settings['raw_gnaf_schema']))

    analyse_jobs = ["ANALYZE {0}".format(row[0]) for row in pg_cur]

    # run queries in separate processes
    psma.multiprocess_list("sql", analyse_jobs, settings, logger)

    logger.info("\t- Step 7 of 7 : tables analysed : {0}".format(
        datetime.now() - step_start))
def populate_raw_gnaf(settings):
    """Step 4 of 7: load the raw GNAF authority-code and per-state PSV files."""
    step_start = datetime.now()

    # authority code files first, then each requested state's files
    load_jobs = get_raw_gnaf_files("authority_code", settings)
    for state in settings['states_to_load']:
        logger.info("\t\t- Loading state {}".format(state))
        load_jobs.extend(get_raw_gnaf_files(state, settings))

    # bail out if there's nothing to load
    if not load_jobs:
        logger.fatal("No raw GNAF PSV files found\nACTION: Check your 'gnaf_network_directory' path")
        logger.fatal("\t- Step 4 of 7 : table populate FAILED!")
    else:
        # load all PSV files using multiprocessing
        psma.multiprocess_list("sql", load_jobs, settings, logger)
        logger.info("\t- Step 4 of 7 : tables populated : {0}".format(
            datetime.now() - step_start))
def boundary_tag_gnaf(pg_cur, settings):
    """Tag GNAF addresses with the admin boundaries they fall in (6 steps).

    Builds one boundary-tag table per address table (principal & alias),
    bulk-tags principal addresses against each admin boundary layer via
    multiprocessing, merges the per-layer tags into a single output table,
    then copies the principal tags to alias addresses and creates a view.

    NOTE(review): this function name is defined twice in this file; the
    later definition wins at import time - confirm which copy is intended.

    :param pg_cur: open database cursor used for the serial DDL/DML
    :param settings: dict of run settings (schemas, admin_bdy_list, etc.)
    """
    # create bdy table list
    # remove localities, postcodes and states as these IDs are already assigned to GNAF addresses
    table_list = list()
    for table in settings['admin_bdy_list']:
        if table[0] not in ["locality_bdys", "postcode_bdys", "state_bdys"]:
            # if no analysis tables created - use the full tables instead of the subdivided ones
            # WARNING: this can add hours to the processing
            if settings['st_subdivide_supported']:
                table_name = "{}_analysis".format(table[0], )
            else:
                table_name = table[0]

            table_list.append([table_name, table[1]])

    # create bdy tagged address tables (one per address table, dropped first)
    for address_table in ["address_principal", "address_alias"]:
        pg_cur.execute(
            "DROP TABLE IF EXISTS {}.{}_admin_boundaries CASCADE".format(
                settings['gnaf_schema'], address_table))
        create_table_list = list()
        create_table_list.append(
            "CREATE TABLE {}.{}_admin_boundaries (gid serial NOT NULL,"
            "gnaf_pid text NOT NULL,"
            # "alias_principal character(1) NOT NULL,"
            "locality_pid text NOT NULL,"
            "locality_name text NOT NULL,"
            "postcode text,"
            "state text NOT NULL".format(settings['gnaf_schema'], address_table))
        # one pid/name column pair per boundary layer
        for table in table_list:
            pid_field = table[1]
            name_field = pid_field.replace("_pid", "_name")
            create_table_list.append(", {} text, {} text".format(
                pid_field, name_field))
        create_table_list.append(
            ") WITH (OIDS=FALSE);ALTER TABLE {}.{}_admin_boundaries OWNER TO {}"
            .format(settings['gnaf_schema'], address_table, settings['pg_user']))
        pg_cur.execute("".join(create_table_list))

    # Step 1 of 6 : tag gnaf addresses with admin boundary IDs, using multiprocessing
    start_time = datetime.now()

    # create temp tables
    template_sql = psma.open_sql_file(
        "04-01a-bdy-tag-create-table-template.sql", settings)
    for table in table_list:
        pg_cur.execute(template_sql.format(table[0], ))

    # create temp tables of bdy tagged gnaf_pids
    template_sql = psma.open_sql_file("04-01b-bdy-tag-template.sql", settings)
    sql_list = list()
    for table in table_list:
        sql = template_sql.format(table[0], table[1])
        # split the tagging query into chunks so it can run in parallel
        short_sql_list = psma.split_sql_into_list(
            pg_cur, sql, settings['admin_bdys_schema'], table[0], "bdys",
            "gid", settings, logger)
        if short_sql_list is not None:
            sql_list.extend(short_sql_list)

    # logger.info('\n'.join(sql_list))

    if sql_list is not None:
        psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info(
        "\t- Step 1 of 6 : principal addresses tagged with admin boundary IDs: {}"
        .format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 2 of 6 : delete invalid matches, create indexes and analyse tables
    sql_list = list()
    for table in table_list:
        # addresses tagged into another state's boundary are invalid
        # (except for Other Territories)
        sql = "DELETE FROM {0}.temp_{1}_tags WHERE gnaf_state <> bdy_state AND gnaf_state <> 'OT';" \
              "CREATE INDEX temp_{1}_tags_gnaf_pid_idx ON {0}.temp_{1}_tags USING btree(gnaf_pid);" \
              "ANALYZE {0}.temp_{1}_tags".format(settings['gnaf_schema'], table[0])
        sql_list.append(sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)

    logger.info(
        "\t- Step 2 of 6 : principal addresses - invalid matches deleted & bdy tag indexes created : {}"
        .format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 3 of 6 : insert boundary tagged addresses
    # create insert statement for multiprocessing: an INSERT ... SELECT with
    # one LEFT OUTER JOIN per boundary layer's temp tag table
    insert_field_list = list()
    insert_field_list.append(
        "(gnaf_pid, locality_pid, locality_name, postcode, state")
    insert_join_list = list()
    insert_join_list.append("FROM {}.address_principals AS pnts ".format(
        settings['gnaf_schema'], ))
    select_field_list = list()
    select_field_list.append("SELECT pnts.gnaf_pid, pnts.locality_pid, "
                             "pnts.locality_name, pnts.postcode, pnts.state")
    drop_table_list = list()
    for table in table_list:
        pid_field = table[1]
        name_field = pid_field.replace("_pid", "_name")
        insert_field_list.append(", {0}, {1}".format(pid_field, name_field))
        select_field_list.append(
            ", temp_{0}_tags.bdy_pid, temp_{0}_tags.bdy_name ".format(
                table[0]))
        insert_join_list.append(
            "LEFT OUTER JOIN {0}.temp_{1}_tags ON pnts.gnaf_pid = temp_{1}_tags.gnaf_pid "
            .format(settings['gnaf_schema'], table[0]))
        drop_table_list.append(
            "DROP TABLE IF EXISTS {0}.temp_{1}_tags;".format(
                settings['gnaf_schema'], table[0]))
    insert_field_list.append(") ")

    insert_statement_list = list()
    insert_statement_list.append(
        "INSERT INTO {0}.address_principal_admin_boundaries ".format(
            settings['gnaf_schema'], ))
    insert_statement_list.append("".join(insert_field_list))
    insert_statement_list.append("".join(select_field_list))
    insert_statement_list.append("".join(insert_join_list))

    sql = "".join(insert_statement_list) + ";"
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "address_principals", "pnts", "gid",
                                        settings, logger)
    # logger.info("\n".join(sql_list)

    if sql_list is not None:
        psma.multiprocess_list("sql", sql_list, settings, logger)

    # drop temp tables
    pg_cur.execute("".join(drop_table_list))

    # get stats
    pg_cur.execute("ANALYZE {0}.address_principal_admin_boundaries ".format(
        settings['gnaf_schema']))

    logger.info(
        "\t- Step 3 of 6 : principal addresses - bdy tags added to output table : {}"
        .format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 4 of 6 : add index to output table
    sql = "CREATE INDEX address_principal_admin_boundaries_gnaf_pid_idx " \
          "ON {0}.address_principal_admin_boundaries USING btree (gnaf_pid)"\
        .format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    logger.info(
        "\t- Step 4 of 6 : created index on bdy tagged address table : {}".
        format(datetime.now() - start_time, ))
    start_time = datetime.now()

    # Step 5 of 6 : log duplicates - happens when 2 boundaries overlap by a very small amount
    # (can be ignored if there's a small number of records affected)
    sql = "SELECT gnaf_pid FROM (SELECT Count(*) AS cnt, gnaf_pid FROM {0}.address_principal_admin_boundaries " \
          "GROUP BY gnaf_pid) AS sqt WHERE cnt > 1".format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    # get cursor description to test if any rows returned safely
    columns = pg_cur.description

    # log gnaf_pids that got duplicate results
    if columns is not None:
        duplicates = pg_cur.fetchall()
        gnaf_pids = list()

        for duplicate in duplicates:
            gnaf_pids.append("\t\t" + duplicate[0])

        if len(gnaf_pids) > 0:
            logger.warning(
                "\t- Step 5 of 6 : found boundary tag duplicates : {}".format(
                    datetime.now() - start_time, ))
            logger.warning("\n".join(gnaf_pids))
        else:
            logger.info(
                "\t- Step 5 of 6 : no boundary tag duplicates : {}".format(
                    datetime.now() - start_time, ))
    else:
        logger.info("\t- Step 5 of 6 : no boundary tag duplicates : {}".format(
            datetime.now() - start_time, ))

    # Step 6 of 6 : Copy principal boundary tags to alias addresses
    pg_cur.execute(
        psma.open_sql_file("04-06-bdy-tags-for-alias-addresses.sql", settings))
    logger.info(
        "\t- Step 6 of 6 : alias addresses boundary tagged : {}".format(
            datetime.now() - start_time, ))

    # Step 7 of 7 : Create view of all bdy tags
    # NOTE(review): the comment says "Step 7 of 7" but the log message below
    # says "Step 6 of 6" - looks like a copy/paste slip; confirm intended step
    # numbering before changing the log text
    pg_cur.execute(
        psma.open_sql_file("04-07-create-bdy-tag-view.sql", settings))
    logger.info(
        "\t- Step 6 of 6 : boundary tagged address view created : {}".format(
            datetime.now() - start_time, ))
def create_reference_tables(pg_cur, settings):
    """Build the final GNAF reference tables in 14 sequential steps.

    Most steps execute a single SQL script on the supplied cursor; the
    expensive address-population steps (7 and 12), the derived postcode
    boundaries (13) and the index creation (14) fan out across worker
    processes via psma.multiprocess_list.

    :param pg_cur: open database cursor used for the serial steps
    :param settings: dict of run settings (schemas, states to load, etc.)
    """
    # set postgres search path back to the default
    pg_cur.execute("SET search_path = public, pg_catalog")

    # Step 1 of 14 : create reference tables
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-01-reference-create-tables.sql", settings))
    logger.info("\t- Step 1 of 14 : create reference tables : {0}".format(
        datetime.now() - start_time))

    # Step 2 of 14 : populate localities
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-02-reference-populate-localities.sql",
                           settings))
    logger.info("\t- Step 2 of 14 : localities populated : {0}".format(
        datetime.now() - start_time))

    # Step 3 of 14 : populate locality aliases
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-03-reference-populate-locality-aliases.sql",
                           settings))
    logger.info("\t- Step 3 of 14 : locality aliases populated : {0}".format(
        datetime.now() - start_time))

    # Step 4 of 14 : populate locality neighbours
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-04-reference-populate-locality-neighbours.sql",
                           settings))
    logger.info(
        "\t- Step 4 of 14 : locality neighbours populated : {0}".format(
            datetime.now() - start_time))

    # Step 5 of 14 : populate streets
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-05-reference-populate-streets.sql", settings))
    logger.info(
        "\t- Step 5 of 14 : streets populated : {0}".format(datetime.now() -
                                                            start_time))

    # Step 6 of 14 : populate street aliases
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-06-reference-populate-street-aliases.sql",
                           settings))
    logger.info("\t- Step 6 of 14 : street aliases populated : {0}".format(
        datetime.now() - start_time))

    # Step 7 of 14 : populate addresses, using multiprocessing
    # (the insert is split into chunks keyed on the streets table's gid)
    start_time = datetime.now()
    sql = psma.open_sql_file("03-07-reference-populate-addresses-1.sql",
                             settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "streets", "str", "gid", settings,
                                        logger)
    if sql_list is not None:
        psma.multiprocess_list('sql', sql_list, settings, logger)
    pg_cur.execute(psma.prep_sql("ANALYZE gnaf.temp_addresses;", settings))
    logger.info(
        "\t- Step 7 of 14 : addresses populated : {0}".format(datetime.now() -
                                                              start_time))

    # Step 8 of 14 : populate principal alias lookup
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-08-reference-populate-address-alias-lookup.sql",
                           settings))
    logger.info(
        "\t- Step 8 of 14 : principal alias lookup populated : {0}".format(
            datetime.now() - start_time))

    # Step 9 of 14 : populate primary secondary lookup
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file(
            "03-09-reference-populate-address-secondary-lookup.sql", settings))
    pg_cur.execute(
        psma.prep_sql("VACUUM ANALYSE gnaf.address_secondary_lookup",
                      settings))
    logger.info(
        "\t- Step 9 of 14 : primary secondary lookup populated : {0}".format(
            datetime.now() - start_time))

    # Step 10 of 14 : split the Melbourne locality into its 2 postcodes (3000, 3004)
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-10-reference-split-melbourne.sql", settings))
    logger.info(
        "\t- Step 10 of 14 : Melbourne split : {0}".format(datetime.now() -
                                                           start_time))

    # Step 11 of 14 : finalise localities assigned to streets and addresses
    start_time = datetime.now()
    pg_cur.execute(
        psma.open_sql_file("03-11-reference-finalise-localities.sql",
                           settings))
    logger.info("\t- Step 11 of 14 : localities finalised : {0}".format(
        datetime.now() - start_time))

    # Step 12 of 14 : finalise addresses, using multiprocessing
    # (chunks keyed on the localities table's gid)
    start_time = datetime.now()
    sql = psma.open_sql_file("03-12-reference-populate-addresses-2.sql",
                             settings)
    sql_list = psma.split_sql_into_list(pg_cur, sql, settings['gnaf_schema'],
                                        "localities", "loc", "gid", settings,
                                        logger)
    if sql_list is not None:
        psma.multiprocess_list('sql', sql_list, settings, logger)

    # turf the temp address table
    pg_cur.execute(
        psma.prep_sql("DROP TABLE IF EXISTS gnaf.temp_addresses", settings))
    logger.info(
        "\t- Step 12 of 14 : addresses finalised : {0}".format(datetime.now() -
                                                               start_time))

    # Step 13 of 14 : create almost correct postcode boundaries by aggregating localities, using multiprocessing
    # (one aggregation query per requested state, via a WHERE-clause injection)
    start_time = datetime.now()
    sql = psma.open_sql_file("03-13-reference-derived-postcode-bdys.sql",
                             settings)
    sql_list = []
    for state in settings['states_to_load']:
        state_sql = sql.replace("GROUP BY ",
                                "WHERE state = '{0}' GROUP BY ".format(state))
        sql_list.append(state_sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)

    # create analysis table?
    if settings['st_subdivide_supported']:
        pg_cur.execute(
            psma.open_sql_file("03-13a-create-postcode-analysis-table.sql",
                               settings))
    logger.info("\t- Step 13 of 14 : postcode boundaries created : {0}".format(
        datetime.now() - start_time))

    # Step 14 of 14 : create indexes, primary and foreign keys, using multiprocessing
    # (one statement per line; skip blank lines and "--" comment lines)
    start_time = datetime.now()
    raw_sql_list = psma.open_sql_file("03-14-reference-create-indexes.sql",
                                      settings).split("\n")
    sql_list = []
    for sql in raw_sql_list:
        if sql[0:2] != "--" and sql[0:2] != "":
            sql_list.append(sql)
    psma.multiprocess_list("sql", sql_list, settings, logger)
    logger.info(
        "\t- Step 14 of 14 : create primary & foreign keys and indexes : {0}".
        format(datetime.now() - start_time))
def create_states_and_prep_localities(settings):
    """Step 1 of 7: build the state table and prep locality boundaries in parallel.

    NOTE(review): this function is defined more than once in this file; the
    later definition wins at import time - confirm which copy is intended.
    """
    timer = datetime.now()

    jobs = [
        psma.open_sql_file("01a-create-states-from-sa4s.sql", settings),
        psma.open_sql_file("01b-prep-locality-boundaries.sql", settings),
    ]
    psma.multiprocess_list("sql", jobs, settings, logger)

    elapsed = datetime.now() - timer
    logger.info(
        "\t- Step 1 of 7 : state table created & localities prepped : {0}"
        .format(elapsed))
def boundary_tag_gnaf(pg_cur, settings):
    """Tag GNAF addresses with the admin boundaries they fall in (8 steps).

    Builds a single combined address_admin_boundaries table, then loops over
    the principal and alias address tables, running the same tag / clean /
    insert pipeline for each (steps 1-3 and 4-6; `i` tracks the running step
    number across both iterations), before indexing the output table and
    reporting any duplicate tags.

    NOTE(review): this function name is defined twice in this file; the
    later definition wins at import time - confirm which copy is intended.

    :param pg_cur: open database cursor used for the serial DDL/DML
    :param settings: dict of run settings (schemas, admin_bdy_list, etc.)
    """
    # create bdy table list
    # remove localities, postcodes and states as these IDs are already assigned to GNAF addresses
    table_list = list()
    for table in settings['admin_bdy_list']:
        if table[0] not in ["locality_bdys", "postcode_bdys", "state_bdys"]:
            # if no analysis tables created - use the full tables instead of the subdivided ones
            # WARNING: this can add hours to the processing
            if settings['st_subdivide_supported']:
                table_name = "{0}_analysis".format(table[0], )
            else:
                table_name = table[0]

            table_list.append([table_name, table[1]])

    # create bdy tagged address table (dropped and rebuilt each run)
    pg_cur.execute(
        "DROP TABLE IF EXISTS {0}.address_admin_boundaries CASCADE".format(
            settings['gnaf_schema'], ))
    create_table_list = list()
    create_table_list.append(
        "CREATE TABLE {0}.address_admin_boundaries (gid serial NOT NULL,"
        "gnaf_pid character varying(16) NOT NULL,"
        "alias_principal character(1) NOT NULL,"
        "locality_pid character varying(16) NOT NULL,"
        "locality_name character varying(100) NOT NULL,"
        "postcode character varying(4),"
        "state character varying(3) NOT NULL".format(
            settings['gnaf_schema'], ))
    # one pid/name column pair per boundary layer
    for table in table_list:
        pid_field = table[1]
        name_field = pid_field.replace("_pid", "_name")
        create_table_list.append(
            ", {0} character varying(15), {1} character varying(100)".format(
                pid_field, name_field))
    create_table_list.append(
        ") WITH (OIDS=FALSE);ALTER TABLE {0}.address_admin_boundaries OWNER TO {1}"
        .format(settings['gnaf_schema'], settings['pg_user']))
    pg_cur.execute("".join(create_table_list))

    # running step counter across both address tables (6 steps in the loop)
    i = 0

    for address_table in ["address_principals", "address_aliases"]:
        # Step 1/4 of 8 : tag gnaf addresses with admin boundary IDs, using multiprocessing
        start_time = datetime.now()

        # create temp tables
        template_sql = psma.open_sql_file(
            "04-01a-bdy-tag-create-table-template.sql", settings)
        for table in table_list:
            pg_cur.execute(template_sql.format(table[0], ))

        # create temp tables of bdy tagged gnaf_pids
        template_sql = psma.open_sql_file("04-01b-bdy-tag-template.sql",
                                          settings)
        sql_list = list()
        for table in table_list:
            sql = template_sql.format(table[0], table[1])
            # split the tagging query into chunks so it can run in parallel
            short_sql_list = psma.split_sql_into_list(
                pg_cur, sql, settings['admin_bdys_schema'], table[0], "bdys",
                "gid", settings, logger)
            if short_sql_list is not None:
                sql_list.extend(short_sql_list)

        # logger.info('\n'.join(sql_list))

        if sql_list is not None:
            psma.multiprocess_list("sql", sql_list, settings, logger)

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - gnaf addresses tagged with admin boundary IDs: {2}"
            .format(i, address_table, datetime.now() - start_time))
        start_time = datetime.now()

        # Step 2/5 of 8 : delete invalid matches, create indexes and analyse tables
        # (addresses tagged into another state's boundary are invalid,
        # except for Other Territories)
        sql_list = list()
        for table in table_list:
            sql = "DELETE FROM {0}.temp_{1}_tags WHERE gnaf_state <> bdy_state AND gnaf_state <> 'OT';" \
                  "CREATE INDEX temp_{1}_tags_gnaf_pid_idx ON {0}.temp_{1}_tags USING btree(gnaf_pid);" \
                  "ANALYZE {0}.temp_{1}_tags".format(settings['gnaf_schema'], table[0])
            sql_list.append(sql)
        psma.multiprocess_list("sql", sql_list, settings, logger)

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - invalid matches deleted & bdy tag indexes created : {2}"
            .format(i, address_table, datetime.now() - start_time))
        start_time = datetime.now()

        # Step 3/6 of 8 : insert boundary tagged addresses
        # create insert statement for multiprocessing: an INSERT ... SELECT
        # with one LEFT OUTER JOIN per boundary layer's temp tag table
        insert_field_list = list()
        insert_field_list.append(
            "(gnaf_pid, alias_principal, locality_pid, locality_name, postcode, state"
        )
        insert_join_list = list()
        insert_join_list.append("FROM {0}.{1} AS pnts ".format(
            settings['gnaf_schema'], address_table))
        select_field_list = list()
        select_field_list.append(
            "SELECT pnts.gnaf_pid, pnts.alias_principal, pnts.locality_pid, "
            "pnts.locality_name, pnts.postcode, pnts.state")
        drop_table_list = list()
        for table in table_list:
            pid_field = table[1]
            name_field = pid_field.replace("_pid", "_name")
            insert_field_list.append(", {0}, {1}".format(
                pid_field, name_field))
            select_field_list.append(
                ", temp_{0}_tags.bdy_pid, temp_{0}_tags.bdy_name ".format(
                    table[0]))
            insert_join_list.append(
                "LEFT OUTER JOIN {0}.temp_{1}_tags ON pnts.gnaf_pid = temp_{1}_tags.gnaf_pid "
                .format(settings['gnaf_schema'], table[0]))
            drop_table_list.append(
                "DROP TABLE IF EXISTS {0}.temp_{1}_tags;".format(
                    settings['gnaf_schema'], table[0]))
        insert_field_list.append(") ")

        insert_statement_list = list()
        insert_statement_list.append(
            "INSERT INTO {0}.address_admin_boundaries ".format(
                settings['gnaf_schema'], ))
        insert_statement_list.append("".join(insert_field_list))
        insert_statement_list.append("".join(select_field_list))
        insert_statement_list.append("".join(insert_join_list))

        sql = "".join(insert_statement_list) + ";"
        sql_list = psma.split_sql_into_list(pg_cur, sql,
                                            settings['gnaf_schema'],
                                            address_table, "pnts", "gid",
                                            settings, logger)
        # logger.info("\n".join(sql_list)

        if sql_list is not None:
            psma.multiprocess_list("sql", sql_list, settings, logger)

        # drop temp tables
        pg_cur.execute("".join(drop_table_list))

        # get stats
        pg_cur.execute("ANALYZE {0}.address_admin_boundaries ".format(
            settings['gnaf_schema']))

        i += 1
        logger.info(
            "\t- Step {0} of 8 : {1} - bdy tags added to output table : {2}".
            format(i, address_table, datetime.now() - start_time))

    start_time = datetime.now()

    # Step 7 of 8 : add index to output table
    sql = "CREATE INDEX address_admin_boundaries_gnaf_pid_idx ON {0}.address_admin_boundaries USING btree (gnaf_pid)"\
        .format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    i += 1
    logger.info(
        "\t- Step {0} of 8 : created index on bdy tagged address table : {1}".
        format(i, datetime.now() - start_time))
    start_time = datetime.now()

    # Step 8 of 8 : log duplicates - happens when 2 boundaries overlap by a very small amount
    # (can be ignored if there's a small number of records affected)
    sql = "SELECT gnaf_pid FROM (SELECT Count(*) AS cnt, gnaf_pid FROM {0}.address_admin_boundaries " \
          "GROUP BY gnaf_pid) AS sqt WHERE cnt > 1".format(settings['gnaf_schema'])
    pg_cur.execute(sql)

    i += 1

    # NOTE(review): fetchall() on an empty result set returns [] rather than
    # raising, so the "found duplicates" warning can fire with an empty list;
    # the except branch presumably targets driver errors - confirm intent
    try:
        duplicates = pg_cur.fetchall()
        gnaf_pids = list()

        for duplicate in duplicates:
            gnaf_pids.append("\t\t" + duplicate[0])

        logger.warning(
            "\t- Step {0} of 8 : found boundary tag duplicates : {1}".format(
                i, datetime.now() - start_time))
        logger.warning("\n".join(gnaf_pids))
    except psycopg2.Error:
        logger.info(
            "\t- Step {0} of 8 : no boundary tag duplicates : {1}".format(
                i, datetime.now() - start_time))
def load_raw_admin_boundaries(pg_cur, settings):
    """Step 1 of 3: load the raw admin boundary Shapefiles/DBFs into Postgres.

    Walks the admin boundary download directory per requested state, building
    shp2pgsql-piped-to-psql shell commands: the first file seen for each
    output table gets a create-mode command (cmd_list1), subsequent states'
    files get append-mode commands (cmd_list2). The create commands are run
    first, then the appends, both via multiprocessing.

    Side effect: appends "authority_code" to settings['states_to_load'].

    :param pg_cur: open database cursor (used for view drops & schema create)
    :param settings: dict of run settings (pg connection details, paths, etc.)
    """
    start_time = datetime.now()

    # drop existing views
    pg_cur.execute(
        psma.open_sql_file("02-01-drop-admin-bdy-views.sql", settings))

    # add locality class authority code table
    settings['states_to_load'].extend(["authority_code"])

    # create schema
    if settings['raw_admin_bdys_schema'] != "public":
        pg_cur.execute(
            "CREATE SCHEMA IF NOT EXISTS {0} AUTHORIZATION {1}".format(
                settings['raw_admin_bdys_schema'], settings['pg_user']))

    # set psql connect string and password
    psql_str = "psql -U {0} -d {1} -h {2} -p {3}"\
        .format(settings['pg_user'], settings['pg_db'],
                settings['pg_host'], settings['pg_port'])

    # only prepend a PGPASSWORD env assignment when it isn't already set
    password_str = ''
    if not os.getenv("PGPASSWORD"):
        if platform.system() == "Windows":
            password_str = "SET"
        else:
            password_str = "export"

        password_str += " PGPASSWORD={0}&&".format(settings['pg_password'])

    # get file list
    table_list = []
    cmd_list1 = []  # create-mode commands (first file per output table)
    cmd_list2 = []  # append-mode commands (subsequent states' files)

    for state in settings['states_to_load']:
        state = state.lower()
        # get a dictionary of Shapefiles and DBFs matching the state
        for root, dirs, files in os.walk(
                settings['admin_bdys_local_directory']):
            for file_name in files:
                if file_name.lower().startswith(state + "_"):
                    if file_name.lower().endswith("_shp.dbf"):
                        # change file type for spatial files
                        if file_name.lower().endswith("_polygon_shp.dbf"):
                            spatial = True
                            bdy_file = os.path.join(
                                root, file_name.replace(".dbf", ".shp"))
                        else:
                            spatial = False
                            bdy_file = os.path.join(root, file_name)

                        # e.g. nsw_locality_shp.dbf -> aus_locality
                        bdy_table = file_name.lower().replace(
                            state + "_", "aus_", 1).replace("_shp.dbf", "")

                        # set command line parameters depending on whether this is the 1st state (for creating tables)
                        # -d drop/create vs -a append; -s 4283 sets GDA94 SRID
                        # for spatial files, -G -n for attribute-only DBFs
                        table_list_add = False

                        if bdy_table not in table_list:
                            table_list_add = True

                            if spatial:
                                params = "-d -D -s 4283 -i"
                            else:
                                params = "-d -D -G -n -i"
                        else:
                            if spatial:
                                params = "-a -D -s 4283 -i"
                            else:
                                params = "-a -D -G -n -i"

                        cmd = "{0}shp2pgsql {1} \"{2}\" {3}.{4} | {5}".format(
                            password_str, params, bdy_file,
                            settings['raw_admin_bdys_schema'], bdy_table,
                            psql_str)

                        # if locality file from Towns folder: don't add - it's a duplicate
                        if "town points" not in bdy_file.lower():
                            if table_list_add:
                                table_list.append(bdy_table)
                                cmd_list1.append(cmd)
                            else:
                                cmd_list2.append(cmd)
                        else:
                            if not bdy_file.lower().endswith(
                                    "_locality_shp.dbf"):
                                if table_list_add:
                                    table_list.append(bdy_table)
                                    cmd_list1.append(cmd)
                                else:
                                    cmd_list2.append(cmd)

    # logger.info('\n'.join(cmd_list1)
    # logger.info('\n'.join(cmd_list2)

    # are there any files to load?
    if len(cmd_list1) == 0:
        logger.fatal(
            "No Admin Boundary files found\nACTION: Check your 'admin-bdys-path' argument"
        )
    else:
        # load files in separate processes -
        # do the commands that create the tables first before attempting the subsequent insert commands
        psma.multiprocess_list("cmd", cmd_list1, settings, logger)
        psma.multiprocess_list("cmd", cmd_list2, settings, logger)
        logger.info(
            "\t- Step 1 of 3 : raw admin boundaries loaded : {0}".format(
                datetime.now() - start_time))