def dump(overlaps, aggregate): """Dump output designatedlands table to file """ if aggregate: if overlaps: util.log('ignoring --overlaps flag') geoutil.dump_aggregate(config, 'designatedlands_agg') else: if overlaps: config['out_table'] = config['out_table'] + '_overlaps' db = pgdata.connect(config["db_url"], schema="public") util.log('Dumping %s to %s' % (config['out_table'], config['out_file'])) columns = [ c for c in db[config['out_table']].columns if c != 'geom' and 'prelim' not in c ] ogr_sql = """SELECT {cols}, st_collectionextract(st_safe_repair(st_snaptogrid(geom, .001)), 3) as geom FROM {t} WHERE designation IS NOT NULL """.format(cols=",".join(columns), t=config['out_table']) util.log(ogr_sql) db = pgdata.connect(config["db_url"]) db.pg2ogr( ogr_sql, config['out_format'], config['out_file'], config['out_table'], geom_type="MULTIPOLYGON", )
def overlay(in_file, in_layer, dump_file, new_layer_name): """Intersect layer with designatedlands """ # load in_file to postgres db = pgdata.connect(config['db_url'], schema="public") if not in_layer: in_layer = fiona.listlayers(in_file)[0] if not new_layer_name: new_layer_name = in_layer[:63] # Maximum table name length is 63 out_layer = new_layer_name[:50] + "_overlay" db.ogr2pg(in_file, in_layer=in_layer, out_layer=new_layer_name) # pull distinct tiles iterable into a list tiles = [t for t in db["tiles"].distinct('map_tile')] # uncomment and adjust for debugging a specific tile # tiles = [t for t in tiles if t[:4] == '092K'] util.log("Intersecting %s with %s" % (config['out_table'], new_layer_name)) geoutil.intersect( db, config['out_table'], new_layer_name, out_layer, config['n_processes'], tiles, ) # dump result to file if dump_file: util.log("Dumping intersect to file %s " % config['out_file']) dump(out_layer, config['out_file'], config['out_format'])
def create_temp_table(): # create new table db = pgdata.connect() db.execute("DROP TABLE IF EXISTS whse_basemapping.temp_upstream_wb_area_ha") db.execute( """ CREATE TABLE whse_basemapping.temp_upstream_wb_area_ha (linear_feature_id bigint primary key, upstream_lake_ha double precision, upstream_reservoir_ha double precision, upstream_wetland_ha double precision) """ ) groups = [ r[0] for r in db.query( """ SELECT watershed_group_code FROM whse_basemapping.fwa_watershed_groups_poly WHERE watershed_group_code in ('BULK','MORR','HORS','LNIC','ELKR') ORDER BY watershed_group_code """ ) ] # use a raw connection so that our queries are not wrapped in transactions - # postgres will therefore clear the on disk temp files and we won't run out of disk space... conn = psycopg2.connect("") # insert per watershed group for wsg in groups: print(wsg) cur = conn.cursor() cur.execute(db.queries["temp_upstream_wb_area_ha"], (wsg,)) conn.commit() cur.close()
def segment_streams(stream_table, point_table): """ Break streams at points. Break lines in stream_table at each location specified in point_table Points are defined as blue_line_key / route measure (not geom). Each table must have the standard FWA columns. """ db = pgdata.connect() stream_schema, stream_table = db.parse_table_name(stream_table) point_schema, point_table = db.parse_table_name(point_table) groups = [ g[0] for g in db.query( "SELECT DISTINCT watershed_group_code FROM bcfishpass.streams") ] query = sql.SQL(read_file("sql/00_segment_streams.sql")).format( stream_schema=sql.Identifier(stream_schema), stream_table=sql.Identifier(stream_table), point_schema=sql.Identifier(point_schema), point_table=sql.Identifier(point_table), ) func = partial(execute_parallel, query) n_processes = multiprocessing.cpu_count() - 1 pool = multiprocessing.Pool(processes=n_processes) pool.map(func, groups) pool.close() pool.join()
def apply_updates(): """apply updates rather than re-loading to another temp table""" db = pgdata.connect() # add new columns if not present db.execute("ALTER TABLE whse_basemapping.fwa_stream_networks_sp ADD COLUMN IF NOT EXISTS upstream_lake_ha double precision") db.execute("ALTER TABLE whse_basemapping.fwa_stream_networks_sp ADD COLUMN IF NOT EXISTS upstream_reservoir_ha double precision") db.execute("ALTER TABLE whse_basemapping.fwa_stream_networks_sp ADD COLUMN IF NOT EXISTS upstream_wetland_ha double precision") groups = [ r[0] for r in db.query( """ SELECT watershed_group_code FROM whse_basemapping.fwa_watershed_groups_poly WHERE watershed_group_code in ('BULK','MORR','HORS','LNIC','ELKR') ORDER BY watershed_group_code """ ) ] conn = psycopg2.connect("") for wsg in groups: print(wsg) cur = conn.cursor() cur.execute(db.queries["update_upstream_wb_area_ha"], (wsg,)) conn.commit() cur.close()
def test_create_table(): db = connect(URL, schema="pgdata") columns = [Column('user_id', Integer, primary_key=True), Column('user_name', UnicodeText, nullable=False), Column('email_address', UnicodeText), Column('password', UnicodeText, nullable=False)] employees = db.create_table("employees", columns) assert employees.table.exists()
def add_downstream_ids( table_a, id_a, table_b, id_b, downstream_ids_col, include_equivalent_measure ): """note downstream ids """ db = pgdata.connect() schema_a, table_a = db.parse_table_name(table_a) schema_b, table_b = db.parse_table_name(table_b) # ensure that any existing values get removed db.execute( f"ALTER TABLE {schema_a}.{table_a} DROP COLUMN IF EXISTS {downstream_ids_col}" ) temp_table = table_a + "_tmp" db[f"{schema_a}.{temp_table}"].drop() db.execute(f"CREATE TABLE {schema_a}.{temp_table} (LIKE {schema_a}.{table_a})") db.execute( f"ALTER TABLE {schema_a}.{temp_table} ADD COLUMN {downstream_ids_col} integer[]" ) groups = sorted( [ g[0] for g in db.query( f"SELECT DISTINCT watershed_group_code from {schema_a}.{table_a}" ) ] ) # todo - is this really the best way to specify which query to use? if include_equivalent_measure: q = "sql/00_add_downstream_and_equivalent_ids.sql" else: q = "sql/00_add_downstream_ids.sql" query = sql.SQL(read_file(q)).format( schema_a=sql.Identifier(schema_a), schema_b=sql.Identifier(schema_b), temp_table=sql.Identifier(temp_table), table_a=sql.Identifier(table_a), table_b=sql.Identifier(table_b), id_a=sql.Identifier(id_a), id_b=sql.Identifier(id_b), dnstr_ids_col=sql.Identifier(downstream_ids_col), ) # run each group in parallel func = partial(execute_parallel, query) n_processes = multiprocessing.cpu_count() - 1 pool = multiprocessing.Pool(processes=n_processes) pool.map(func, groups) pool.close() pool.join() # drop source table, rename new table, re-create indexes db[f"{schema_a}.{table_a}"].drop() db.execute(f"ALTER TABLE {schema_a}.{temp_table} RENAME TO {table_a}") create_indexes(f"{schema_a}.{table_a}") db.execute(f"ALTER TABLE {schema_a}.{table_a} ADD PRIMARY KEY ({id_a})") # make sure the table gets analyzed before running this again conn = db.engine.raw_connection() conn.set_isolation_level(0) cur = conn.cursor() cur.execute(f"VACUUM ANALYZE {schema_a}.{table_a}")
def test_create_table(): db = connect(URL, schema="pgdata") columns = [ Column('user_id', Integer, primary_key=True), Column('user_name', UnicodeText, nullable=False), Column('email_address', UnicodeText), Column('password', UnicodeText, nullable=False) ] employees = db.create_table("employees", columns) assert employees.table.exists()
def union(db_url, in_table, columns, out_table): """Union/merge overlapping records with equivalent values for provided columns """ db = pgdata.connect(db_url) sql = f"""CREATE TABLE {out_table} AS SELECT {columns}, (ST_Dump(ST_Union(geom))).geom as geom FROM {in_table} GROUP BY {columns} """ db.execute(sql)
def parallel_tiled(db_url, sql, tile, n_subs=1): """ Create a connection and execute query for specified tile n_subs is the number of places in the sql query that should be substituted by the tile name """ db = pgdata.connect(db_url, schema="designatedlands", multiprocessing=True) # As we are explicitly splitting up our job by tile and processing tiles # concurrently in individual connections we don't want the database to try # and manage parallel execution of these queries within these connections. # Turn off this connection's parallel execution: db.execute("SET max_parallel_workers_per_gather = 0") db.execute(sql, (tile + "%",) * n_subs)
def test_find_module_query(): db = connect(URL) db.execute(db.queries['utmzen2bcalb']) sql = """SELECT routines.routine_name FROM information_schema.routines LEFT JOIN information_schema.parameters ON routines.specific_name=parameters.specific_name WHERE routines.specific_schema='public' AND routines.routine_name = 'utmzen2bcalb' ORDER BY routines.routine_name, parameters.ordinal_position; """ assert db.query(sql).fetchone()['routine_name'] == 'utmzen2bcalb'
def execute_parallel(sql, wsg): """Execute sql for specified wsg using a non-pooled, non-parallel conn """ # specify multiprocessing when creating to disable connection pooling db = pgdata.connect(multiprocessing=True) conn = db.engine.raw_connection() cur = conn.cursor() # Turn off parallel execution for this connection, because we are # handling the parallelization ourselves cur.execute("SET max_parallel_workers_per_gather = 0") cur.execute(sql, (wsg, )) conn.commit() cur.close() conn.close()
def create_indexes(table): """create usual fwa indexes """ db = pgdata.connect() schema, table = db.parse_table_name(table) db.execute(f"""CREATE INDEX ON {schema}.{table} (linear_feature_id); CREATE INDEX ON {schema}.{table} (blue_line_key); CREATE INDEX ON {schema}.{table} (watershed_group_code); CREATE INDEX ON {schema}.{table} USING GIST (wscode_ltree); CREATE INDEX ON {schema}.{table} USING BTREE (wscode_ltree); CREATE INDEX ON {schema}.{table} USING GIST (localcode_ltree); CREATE INDEX ON {schema}.{table} USING BTREE (localcode_ltree); CREATE INDEX ON {schema}.{table} USING GIST (geom); """)
def add_upstream_ids(table_a, id_a, table_b, id_b, upstream_ids_col): """note upstream ids """ db = pgdata.connect() schema_a, table_a = db.parse_table_name(table_a) schema_b, table_b = db.parse_table_name(table_b) # ensure that any existing values get removed db.execute( f"ALTER TABLE {schema_a}.{table_a} DROP COLUMN IF EXISTS {upstream_ids_col}" ) temp_table = table_a + "_tmp" db[f"{schema_a}.{temp_table}"].drop() db.execute(f"CREATE TABLE {schema_a}.{temp_table} (LIKE {schema_a}.{table_a})") db.execute( f"ALTER TABLE {schema_a}.{temp_table} ADD COLUMN {upstream_ids_col} integer[]" ) groups = sorted( [ g[0] for g in db.query( f"SELECT DISTINCT watershed_group_code from {schema_a}.{table_a}" ) ] ) query = sql.SQL(read_file("sql/00_add_upstream_ids.sql")).format( schema_a=sql.Identifier(schema_a), schema_b=sql.Identifier(schema_b), temp_table=sql.Identifier(temp_table), table_a=sql.Identifier(table_a), table_b=sql.Identifier(table_b), id_a=sql.Identifier(id_a), id_b=sql.Identifier(id_b), upstr_ids_col=sql.Identifier(upstream_ids_col), ) # run each group in parallel func = partial(execute_parallel, query) n_processes = multiprocessing.cpu_count() - 1 pool = multiprocessing.Pool(processes=n_processes) pool.map(func, groups) pool.close() pool.join() # drop source table, rename new table, re-create indexes db[f"{schema_a}.{table_a}"].drop() db.execute(f"ALTER TABLE {schema_a}.{temp_table} RENAME TO {table_a}") create_indexes(f"{schema_a}.{table_a}") db.execute(f"ALTER TABLE {schema_a}.{table_a} ADD PRIMARY KEY ({id_a})") conn = db.engine.raw_connection() conn.set_isolation_level(0) cur = conn.cursor() cur.execute(f"VACUUM ANALYZE {schema_a}.{table_a}")
def report(point_table, point_id, barriers_table, dnstr_barriers_id): db = pgdata.connect() point_schema, point_table = db.parse_table_name(point_table) barriers_schema, barriers_table = db.parse_table_name(barriers_table) query = sql.SQL(read_file("sql/00_report.sql")).format( point_schema=sql.Identifier(point_schema), point_table=sql.Identifier(point_table), point_id=sql.Identifier(point_id), barriers_schema=sql.Identifier(barriers_schema), barriers_table=sql.Identifier(barriers_table), dnstr_barriers_id=sql.Identifier(dnstr_barriers_id), ) conn = db.engine.raw_connection() cur = conn.cursor() cur.execute(query) conn.commit()
def create_db(): """Create a fresh database """ util.log('Creating database %s' % config['db_url']) pgdata.create_db(config["db_url"]) db = pgdata.connect(config["db_url"]) db.execute("CREATE EXTENSION IF NOT EXISTS postgis") # the pgxn extension does not work on windows # note to the user to add lostgis functions manually with provided # .bat file as a reference if os.name == 'posix': db.execute("CREATE EXTENSION IF NOT EXISTS lostgis") else: util.log( 'Remember to add required lostgis functions to your new database', level=30, ) util.log('See scripts\lostgis_windows.bat as a guide', level=30)
def clip(db_url, in_table, clip_table, out_table): """Clip geometry of in_table by clip_table, writing output to out_table """ db = pgdata.connect(db_url) columns = ", ".join(["a." + c for c in db[in_table].columns if c != "geom"]) sql = f"""CREATE TABLE {out_table} AS SELECT {columns}, CASE WHEN ST_CoveredBy(a.geom, b.geom) THEN a.geom ELSE ST_Multi( ST_CollectionExtract( ST_Intersection(a.geom,b.geom), 3)) END AS geom FROM {in_table} AS a INNER JOIN {clip_table} AS b ON ST_Intersects(a.geom, b.geom) """ db.execute(sql)
def test_create_schema(): db = connect(URL, schema="pgdata") db.create_schema("pgdata")
def load(alias, force_download): """Download data, load to postgres """ db = pgdata.connect(config["db_url"]) sources = util.read_csv(config["source_csv"]) # filter sources based on optional provided alias and ignoring excluded if alias: sources = [s for s in sources if s["alias"] == alias] if not sources: raise ValueError('Alias %s does not exist' % alias) sources = [s for s in sources if s["exclude"] != 'T'] # process sources where automated downloads are avaiable load_commands = [] for source in [s for s in sources if s["manual_download"] != 'T']: # handle BCGW downloads if urlparse(source["url"]).hostname == 'catalogue.data.gov.bc.ca': file, layer = download.download_bcgw( source["url"], config["dl_path"], email=config["email"], force_download=force_download, ) # handle all other downloads (zipfiles only) else: file, layer = download.download_non_bcgw( source['url'], config['dl_path'], source['file_in_url'], source['layer_in_file'], force_download=force_download, ) load_commands.append( db.ogr2pg( file, in_layer=layer, out_layer=source["input_table"], sql=source["query"], cmd_only=True, )) # process manually downloaded sources for source in [s for s in sources if s["manual_download"] == 'T']: file = os.path.join(config['dl_path'], source["file_in_url"]) if not os.path.exists(file): raise Exception(file + " does not exist, download it manually") load_commands.append( db.ogr2pg( file, in_layer=source['layer_in_file'], out_layer=source["input_table"], sql=source["query"], cmd_only=True, )) # run all ogr commands in parallel util.log('Loading source data to database.') # https://stackoverflow.com/questions/14533458/python-threading-multiple-bash-subprocesses processes = [subprocess.Popen(cmd, shell=True) for cmd in load_commands] for p in processes: p.wait() # log ogr statements for debugging #for cmd in load_commands: # util.log(cmd) # subprocess.call(cmd, shell=True) # create tiles layer util.log('Creating tiles layer') db.execute(db.queries["create_tiles"])
def test_list_schema(): db = connect(URL, schema="pgdata") assert set(["information_schema", "pgdata", "public"]) <= set(db.schemas)
def dump_aggregate(config, new_layer_name): """ UNSUPPORTED test aggregation of designatedlands over tile boundaries Output data is aggregated across map tiles to remove gaps introduced in tiling of the sources. Aggregation is by distinct 'designation' in the output layer, and is run separately for each designation for speed. To dump data aggregated by 'category' or some other field, build and run your own ogr2ogr command based on below queries. This command is unsupported, aggregation does not quite remove gaps across all records and is very slow. Use mapshaper to aggregate outputs from the dump command (convert to shapefile first) eg: $ mapshaper designatedlands.shp \ -clean snap-interval=0.01 \ -dissolve designatio copy-fields=category \ -explode \ -o dl_clean.shp """ # config = util.read_config(config) db = pgdata.connect(config["db_url"], schema="public") util.log('Aggregating %s to %s' % (config['out_table'], new_layer_name)) # find all non-null designations designations = [ d for d in db[config['out_table']].distinct('designation') if d ] db[new_layer_name].drop() sql = """CREATE TABLE {new_layer_name} AS SELECT designation, category, bc_boundary, geom FROM {out_table} LIMIT 0""".format( new_layer_name=new_layer_name, out_table=config['out_table'] ) db.execute(sql) # iterate through designations to speed up the aggregation for designation in designations: util.log('Adding %s to %s' % (designation, new_layer_name)) # dump records entirely within a tile sql = """ INSERT INTO {new_layer_name} (designation, category, bc_boundary, geom) SELECT dl.designation, dl.category, dl.bc_boundary, dl.geom FROM {t} dl INNER JOIN tiles ON dl.map_tile = tiles.map_tile WHERE dl.designation = %s AND ST_Coveredby(dl.geom, ST_Buffer(tiles.geom, -.01)) """.format( t=config['out_table'], new_layer_name=new_layer_name ) db.execute(sql, (designation,)) # aggregate cross-tile records # Notes: # - expansion/contraction buffering of 3mm to remove gaps between tiles # - ST_Buffer of 0 on ST_Collect is much faster than ST_Union # - ST_Collect is a bit less robust, it requires # ST_RemoveRepeatedPoints to complete successfully on sources that # appear to come from rasters (mineral_reserve, ogma_legal) sql = """ INSERT INTO {new_layer_name} (designation, category, bc_boundary, geom) SELECT designation, category, bc_boundary, (ST_Dump(ST_Buffer(geom, -.003))).geom as geom FROM ( SELECT dl.designation, dl.category, dl.bc_boundary, ST_Buffer( ST_RemoveRepeatedPoints( ST_SnapToGrid( ST_Collect( ST_Buffer(dl.geom, .003)), .001), .01), 0) as geom FROM designatedlands dl INNER JOIN tiles ON dl.map_tile = tiles.map_tile WHERE dl.designation = %s AND NOT ST_Coveredby(dl.geom, ST_Buffer(tiles.geom, -.01)) GROUP BY dl.designation, dl.category, dl.bc_boundary) as foo """.format( t=config['out_table'], new_layer_name=new_layer_name ) db.execute(sql, (designation,)) util.log('Dumping %s to file %s' % (new_layer_name, config['out_file'])) db.pg2ogr( "SELECT * from " + new_layer_name, config['out_format'], config['out_file'], new_layer_name, )
def test_insert_one(): db = connect(URL) table = db["pgdata.employees"] table.insert(DATA[0])
def test_null_table(): db = connect(URL) db["table_that_does_not_exist"].drop() assert db["table_that_does_not_exist"]._is_dropped is True
def parallel_query(id): sql = "SELECT user_name FROM pgdata.employees WHERE user_id = %s" db = connect(URL, multiprocessing=True) db.engine.execute(sql, (id,))
def test_find_sqlpath_query(): db = connect(URL, sql_path='tests/sql') assert db.queries['test'] == "SELECT 1 as test"
def test_query_keys(): db = connect(URL) sql = "SELECT user_name FROM pgdata.employees WHERE user_id = %s" assert db.engine.execute(sql, (1,)).keys() == ['user_name']
def test_query_params_1(): db = connect(URL) sql = "SELECT user_name FROM pgdata.employees WHERE user_id = %s" r = db.query(sql, (1,)).fetchall() assert r[0][0] == 'Fred' assert r[0]["user_name"] == 'Fred'
def test_build_query(): db = connect(URL) sql = "SELECT $UserName FROM pgdata.employees WHERE $UserId = 1" lookup = {"UserName": "******", "UserId": "user_id"} new_sql = db.build_query(sql, lookup) assert new_sql == "SELECT user_name FROM pgdata.employees WHERE user_id = 1"
def test_get_table_cross_schema(): db = connect(URL) assert db["pgdata.employees"] is not None
def test_distinct(): db = connect(URL, schema="pgdata") users = [r[0] for r in db["employees"].distinct('user_name')] assert len(users) == 3
def test_connect(): db = connect(URL, schema="pgdata") assert db.url == "postgresql://*****:*****@localhost:5432/pgdata"
def test_insert_many(): db = connect(URL) table = db["pgdata.employees"] table.insert(DATA[1:])
def test_drop_schema(): db = connect(URL, schema="pgdata") db.drop_schema("pgdata", cascade=True)
def test_create_index(): db = connect(URL, schema="pgdata") indexname = 'employees_user_name_idx' db['employees'].create_index(['user_name'], indexname) indexes = db['employees'].indexes.keys() assert indexname in indexes
def __init__(self, config_file=None): LOG.info("Initializing designatedlands") # load default config self.config = DEFAULT_CONFIG.copy() # if provided with a config file, replace config values with those present in # thie config file if config_file: if not os.path.exists(config_file): raise ConfigValueError(f"File {config_file} does not exist") self.read_config(config_file) # set default n_processes to the number of cores available minus one if self.config["n_processes"] == -1: self.config["n_processes"] = multiprocessing.cpu_count() - 1 # don't try and use more cores than are available elif self.config["n_processes"] > multiprocessing.cpu_count(): self.config["n_processes"] = multiprocessing.cpu_count() self.db = pgdata.connect(self.config["db_url"]) self.db.ogr_string = f"PG:host={self.db.host} user={self.db.user} dbname={self.db.database} password={self.db.password} port={self.db.port}" # define valid restriction classes and assign raster values self.restriction_lookup = { "PROTECTED": 5, "FULL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1, "NONE": 0, } # load sources from csv self.read_sources() # define bounds manually self.bounds = [273287.5, 367687.5, 1870687.5, 1735887.5] width = max( int( ceil( (self.bounds[2] - self.bounds[0]) / float(self.config["resolution"]) ) ), 1, ) height = max( int( ceil( (self.bounds[3] - self.bounds[1]) / float(self.config["resolution"]) ) ), 1, ) self.raster_profile = { "count": 1, "crs": "EPSG:3005", "width": width, "height": height, "transform": Affine( self.config["resolution"], 0, self.bounds[0], 0, -self.config["resolution"], self.bounds[3], ), "nodata": 255, }
def bc2pg( dataset, db_url, table, schema, query, pagesize, max_workers, dim, fid, verbose, quiet, ): """Download a DataBC WFS layer to postgres - an ogr2ogr wrapper. \b $ bcdata bc2pg bc-airports --db_url postgresql://postgres:postgres@localhost:5432/postgis The default target database can be specified by setting the $DATABASE_URL environment variable. https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls """ # for this command, default to INFO level logging # (echo the ogr2ogr commands by default) verbosity = verbose - quiet log_level = max(10, 20 - 10 * verbosity) logging.basicConfig(stream=sys.stderr, level=log_level) log = logging.getLogger(__name__) src = bcdata.validate_name(dataset) src_schema, src_table = [i.lower() for i in src.split(".")] if not schema: schema = src_schema if not table: table = src_table # always upper if fid: fid = fid.upper() # create schema if it does not exist conn = pgdata.connect(db_url) if schema not in conn.schemas: click.echo("Schema {} does not exist, creating it".format(schema)) conn.create_schema(schema) # build parameters for each required request param_dicts = bcdata.define_request( dataset, query=query, sortby=fid, pagesize=pagesize ) # run the first request / load payload = urlencode(param_dicts[0], doseq=True) url = bcdata.WFS_URL + "?" + payload db = parse_db_url(db_url) db_string = "PG:host={h} user={u} dbname={db} password={pwd}".format( h=db["host"], u=db["user"], db=db["database"], pwd=db["password"] ) # create the table command = [ "ogr2ogr", "-lco", "OVERWRITE=YES", "-lco", "SCHEMA={}".format(schema), "-lco", "GEOMETRY_NAME=geom", "-f", "PostgreSQL", db_string, "-t_srs", "EPSG:3005", "-nln", table, url, ] if dim: command = command + ["-dim", dim] if fid: command = command + ["-lco", "FID={}".format(fid)] # for speed with big loads - unlogged, no spatial index if len(param_dicts) > 1: command = command + ["-lco", "UNLOGGED=ON"] command = command + ["-lco", "SPATIAL_INDEX=NONE"] log.info(" ".join(command)) subprocess.run(command) # write to additional separate tables if data is larger than 10k recs if len(param_dicts) > 1: commands = [] for n, paramdict in enumerate(param_dicts[1:]): # create table to load to (so types are identical) sql = """ CREATE TABLE {schema}.{table}_{n} (LIKE {schema}.{table} INCLUDING ALL) """.format( schema=schema, table=table, n=str(n) ) conn.execute(sql) payload = urlencode(paramdict, doseq=True) url = bcdata.WFS_URL + "?" + payload command = [ "ogr2ogr", "-update", "-append", "-f", "PostgreSQL", db_string + " active_schema=" + schema, "-t_srs", "EPSG:3005", "-nln", table + "_" + str(n), url, ] if dim: command = command + ["-dim", dim] commands.append(command) # https://stackoverflow.com/questions/14533458 pool = Pool(max_workers) with click.progressbar( pool.imap(partial(call), commands), length=len(param_dicts) ) as bar: for returncode in bar: if returncode != 0: click.echo("Command failed: {}".format(returncode)) # once loaded, combine & drop for n, _x in enumerate(param_dicts[1:]): sql = """INSERT INTO {schema}.{table} SELECT * FROM {schema}.{table}_{n}""".format( schema=schema, table=table, n=str(n) ) conn.execute(sql) sql = "DROP TABLE {}.{}_{}".format(schema, table, n) conn.execute(sql) conn.execute("ALTER TABLE {}.{} SET LOGGED".format(schema, table)) log.info("Indexing geometry") conn[schema + "." + table].create_index_geom() # deal with primary key - becaue loading to many tables, # ogc_fid is not unique if not fid: sql = "ALTER TABLE {}.{} DROP COLUMN ogc_fid".format(schema, table) conn.execute(sql) sql = "ALTER TABLE {}.{} ADD COLUMN ogc_fid SERIAL PRIMARY KEY".format( schema, table ) conn.execute(sql) log.info( "Load of {} to {} in {} complete".format(src, schema + "." + table, db_url) )
def test_tables_in_schema(): db = connect(URL) tables = db.tables_in_schema("pgdata") assert set(tables) == set(["employees"])
def process(resume, force_preprocess, tiles): """Create output designatedlands tables """ db = pgdata.connect(config["db_url"], schema="public") # run required preprocessing, tile, attempt to clean inputs geoutil.preprocess(db, config['source_csv'], force=force_preprocess) geoutil.tile_sources(db, config['source_csv'], force=force_preprocess) geoutil.clean_and_agg_sources(db, config['source_csv'], force=force_preprocess) # parse the list of tiles tilelist = geoutil.parse_tiles(db, tiles) # create target tables if not resuming from a bailed process if not resume: # create output tables db.execute( db.build_query( db.queries["create_outputs_prelim"], {"table": config['out_table']}, )) # filter sources - use only non-exlcuded sources with hierarchy > 0 sources = [ s for s in util.read_csv(config['source_csv']) if s['hierarchy'] != 0 and s["exclude"] != 'T' ] # To create output table with overlaps, combine all source data # (tiles argument does not apply, we could build a tile query string but # it seems unnecessary) for source in sources: util.log("Inserting %s into preliminary output overlap table" % source["tiled_table"]) sql = db.build_query( db.queries["populate_output_overlaps"], { "in_table": source["tiled_table"], "out_table": config['out_table'] + "_overlaps_prelim", }, ) db.execute(sql) # To create output table with no overlaps, more processing is required # In case of bailing during tests/development, `resume` option is available # to enable resumption of processing at specified hierarchy number if resume: p_sources = [s for s in sources if int(s["hierarchy"]) >= int(resume)] else: p_sources = sources # The tiles layer will fill in gaps between sources (so all BC is included # in output). To do this, first match schema of tiles to other sources db.execute("ALTER TABLE tiles ADD COLUMN IF NOT EXISTS id integer") db.execute("UPDATE tiles SET id = tile_id") db.execute("ALTER TABLE tiles ADD COLUMN IF NOT EXISTS designation text") # Next, add simple tiles layer definition to sources list p_sources.append({"cleaned_table": "tiles", "category": None}) # iterate through all sources for source in p_sources: sql = db.build_query( db.queries["populate_output"], { "in_table": source["cleaned_table"], "out_table": config['out_table'] + "_prelim", }, ) # determine which specified tiles are present in source layer src_tiles = set( geoutil.get_tiles(db, source["cleaned_table"], tile_table='tiles')) if tilelist: tiles = set(tilelist) & src_tiles else: tiles = src_tiles if tiles: util.log("Inserting %s into preliminary output table" % source["cleaned_table"]) # for testing, run only one process and report on tile if config['n_processes'] == 1: for tile in tiles: util.log(tile) db.execute(sql, (tile + "%", ) * 2) else: func = partial(geoutil.parallel_tiled, db.url, sql, n_subs=2) pool = multiprocessing.Pool(processes=config['n_processes']) pool.map(func, tiles) pool.close() pool.join() else: util.log("No tiles to process") # create marine-terrestrial layer if 'bc_boundary' not in db.tables: geoutil.create_bc_boundary(db, config['n_processes']) # overlay output tables with marine-terrestrial definition for table in [config['out_table'], config['out_table'] + "_overlaps"]: util.log('Cutting %s with marine-terrestrial definition' % table) geoutil.intersect( db, table + "_prelim", "bc_boundary", table, config['n_processes'], tiles, ) tidy_designations(db, sources, "cleaned_table", config['out_table']) tidy_designations(db, sources, "tiled_table", config['out_table'] + "_overlaps")
import os import shutil import tempfile import unittest import fiona import pgdata URL = 'postgresql://*****:*****@localhost:5432/pgdata' DB = pgdata.connect(URL) DB.execute('CREATE SCHEMA IF NOT EXISTS pgdata') DATA_1 = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') AIRPORTS = os.path.join(DATA_1, 'bc_airports.json') # also test a path with spaces DATA_2 = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data space') AIRPORTS_2 = os.path.join(DATA_2, 'bc_airports_one.json') class ogrpg(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() self.spaced_dir = tempfile.mkdtemp("spa ced") def test_ogr2pg(self): db = DB db.ogr2pg(AIRPORTS, in_layer='bc_airports', out_layer='bc_airports',
def setup(): create_db(URL) db = connect(URL) db.execute("CREATE EXTENSION IF NOT EXISTS postgis")