def test_exclude_filtering_nodes_and_relations(helsinki_pbf):
    """Excluding a tag drops it across all element types; keeping it selects only matches."""
    from pyrosm import OSM

    # Initialize the reader
    reader = OSM(helsinki_pbf)
    library_filter = {"amenity": ["library"]}

    # Exclude mode: everything except libraries should remain
    excluded = reader.get_data_by_custom_criteria(
        library_filter,
        filter_type="exclude",
    )
    assert excluded.shape == (1081, 37)
    assert "library" not in excluded["amenity"].unique().tolist()
    # There should be nodes, ways and relations
    assert excluded["osm_type"].unique().tolist() == ["node", "way", "relation"]

    # Keep mode: only libraries should remain
    kept = reader.get_data_by_custom_criteria(
        library_filter,
        filter_type="keep",
    )
    assert kept.shape == (7, 23)
    assert kept["amenity"].unique().tolist() == ["library"]
    # There should be nodes and ways (no relations)
    assert kept["osm_type"].unique().tolist() == ["node", "way"]
def test_reading_with_custom_filters_selecting_specific_osm_element(
        helsinki_pbf):
    """Each keep_* flag should restrict the result to exactly one osm_type."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame

    # Get first all data
    reader = OSM(filepath=helsinki_pbf)

    # (keep_nodes, keep_ways, keep_relations, expected osm_type, expected row count)
    cases = [
        (False, False, True, 'relation', 66),
        (False, True, False, 'way', 422),
        (True, False, False, 'node', 36),
    ]
    for keep_nodes, keep_ways, keep_relations, element, expected_count in cases:
        result = reader.get_data_by_custom_criteria(
            custom_filter={'building': True},
            filter_type="keep",
            keep_nodes=keep_nodes,
            keep_ways=keep_ways,
            keep_relations=keep_relations)
        assert isinstance(result, GeoDataFrame)
        # Only the requested element type should be present
        assert len(result['osm_type'].unique()) == 1
        assert result['osm_type'].unique()[0] == element
        assert len(result) == expected_count
def test_reading_with_custom_filters_with_excluding(test_pbf):
    """Excluding a single 'building' value should remove exactly those rows."""
    from pyrosm import OSM
    from shapely.geometry import Polygon
    from geopandas import GeoDataFrame

    # Get first all data
    reader = OSM(filepath=test_pbf)
    all_buildings = reader.get_buildings()

    # Frequency of every 'building' tag value in the full dataset
    value_counts = all_buildings['building'].value_counts()
    total = len(all_buildings)

    for tag_value, occurrences in value_counts.items():
        # Exclude this single value via the custom filter
        subset = reader.get_data_by_custom_criteria(
            custom_filter={'building': [tag_value]},
            filter_type="exclude")
        assert isinstance(subset, GeoDataFrame)
        assert isinstance(subset.loc[0, "geometry"], Polygon)
        assert len(subset) == total - occurrences
        # The excluded value must be gone from the result
        assert tag_value not in subset["building"].unique()
        for col in ['building', 'id', 'timestamp', 'version', 'geometry']:
            assert col in subset.columns
def get_osm_gata(protobuf: str) -> typing.Tuple:
    """get osm-data from protobuf for parks, roads, industrials areas"""
    reader = OSM(protobuf)

    # Green areas (ways and relations only); keep only those larger than
    # 100 000 m^2 when measured in the metric EPSG:3395 projection
    parks = reader.get_data_by_custom_criteria(
        custom_filter={
            'leisure': ['park', 'garden'],
            'natural': ['wood']
        },
        filter_type='keep',
        keep_nodes=False,
        keep_ways=True,
        keep_relations=True)
    parks = parks[parks.to_crs("EPSG:3395").area > 100000]

    # Major road network
    roads = reader.get_data_by_custom_criteria(
        custom_filter={"highway": ["trunk", "primary", "secondary"]})

    # Industrial land use
    industrial = reader.get_data_by_custom_criteria(
        custom_filter={"landuse": ["industrial"]})

    # CRS is a module-level constant — all layers are reprojected to it
    return parks.to_crs(CRS), roads.to_crs(CRS), industrial.to_crs(CRS)
def test_adding_extra_attribute(helsinki_pbf):
    """'extra_attributes' should add exactly one column without changing the rows."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame

    reader = OSM(filepath=helsinki_pbf)
    base = reader.get_data_by_custom_criteria({"highway": True})

    extra_col = "wikidata"
    enriched = reader.get_data_by_custom_criteria({"highway": True},
                                                  extra_attributes=[extra_col])

    # The extra should have one additional column compared to the original one
    assert enriched.shape[1] == base.shape[1] + 1
    # Should have same number of rows
    assert enriched.shape[0] == base.shape[0]
    assert extra_col in enriched.columns
    # The extra column should carry at least some real values
    assert len(enriched[extra_col].dropna().unique()) > 0
    assert isinstance(base, GeoDataFrame)
def test_using_incorrect_booleans(test_pbf):
    """Non-boolean keep_* arguments (or all three False) must raise ValueError.

    BUG FIX: the original try/except pattern silently passed when no
    exception was raised at all, so it could not detect missing validation.
    The helper below fails explicitly in that case.
    """
    from pyrosm import OSM

    osm = OSM(filepath=test_pbf)
    custom_filter = {"building": ["retail"]}
    incorrect_bool = "foo"

    def _expect_value_error(message, **kwargs):
        # The call must raise ValueError whose text contains `message`.
        try:
            osm.get_data_by_custom_criteria(custom_filter=custom_filter,
                                            **kwargs)
        except ValueError as e:
            assert message in str(e), f"Unexpected error message: {e}"
        else:
            raise AssertionError(f"ValueError not raised for {kwargs}")

    _expect_value_error("'keep_nodes' should be boolean type: True or False",
                        keep_nodes=incorrect_bool)
    _expect_value_error("'keep_ways' should be boolean type: True or False",
                        keep_ways=incorrect_bool)
    _expect_value_error("'keep_relations' should be boolean type: True or False",
                        keep_relations=incorrect_bool)
    # All three False is also invalid (message text matches pyrosm's, typo included)
    _expect_value_error("At least on of the following parameters should be True",
                        keep_relations=False, keep_ways=False, keep_nodes=False)
def test_using_incorrect_filter(test_pbf):
    """Malformed custom_filter arguments must raise ValueError.

    BUG FIXES:
      * The original referenced `custom_filter` before it was ever assigned
        in case 1, so the test crashed with NameError instead of exercising
        the intended `custom_filter=None` validation.
      * The try/except pattern silently passed when no exception was raised.
    """
    from pyrosm import OSM

    osm = OSM(filepath=test_pbf)

    def _expect_value_error(custom_filter, message):
        # The call must raise ValueError whose text contains `message`.
        try:
            osm.get_data_by_custom_criteria(custom_filter=custom_filter)
        except ValueError as e:
            assert message in str(e), f"Unexpected error message: {e}"
        else:
            raise AssertionError(
                f"ValueError not raised for filter {custom_filter!r}")

    # 1. Filter must be a dictionary (not None)
    _expect_value_error(None, "should be a Python dictionary")
    # 2. Filter values must be strings
    _expect_value_error({"building": [1]}, "string")
    # 3. Every value must be a string, even when some already are
    _expect_value_error({"building": ["correct_string", 1]}, "string")
    # 4. Filter keys must be strings
    _expect_value_error({0: ["residential"]}, "string")
def test_using_two_level_custom_filter(helsinki_region_pbf):
    """Combining osm_keys_to_keep with a custom_filter keeps both keys populated."""
    from pyrosm import OSM

    reader = OSM(filepath=helsinki_region_pbf)
    result = reader.get_data_by_custom_criteria(
        custom_filter={"amenity": ["school"]},
        osm_keys_to_keep=["building"])

    assert result.shape == (72, 25)
    # Now 'building' and 'amenity' should not have NaNs
    assert not result["building"].hasnans
    assert not result["amenity"].hasnans
def test_using_incorrect_filter_type(test_pbf):
    """An invalid filter_type must raise ValueError.

    BUG FIX: the original try/except silently passed when no exception
    was raised; the `else` branch now fails explicitly in that case.
    """
    from pyrosm import OSM

    osm = OSM(filepath=test_pbf)

    try:
        osm.get_data_by_custom_criteria(custom_filter={"building": ["retail"]},
                                        filter_type="incorrect_test")
    except ValueError as e:
        # The message should name the two accepted values
        assert "should be either 'keep' or 'exclude'" in str(e), \
            f"Unexpected error message: {e}"
    else:
        raise AssertionError("ValueError not raised for invalid filter_type")
def test_using_incorrect_osm_keys(test_pbf):
    """A non-str/list osm_keys_to_keep must raise ValueError.

    BUG FIX: the original try/except silently passed when no exception
    was raised; the `else` branch now fails explicitly in that case.
    """
    from pyrosm import OSM

    osm = OSM(filepath=test_pbf)

    try:
        osm.get_data_by_custom_criteria(custom_filter={"building": ["retail"]},
                                        osm_keys_to_keep=1)
    except ValueError as e:
        # The message should explain the accepted types
        assert "'osm_keys_to_keep' -parameter should be of type str or list." \
            in str(e), f"Unexpected error message: {e}"
    else:
        raise AssertionError(
            "ValueError not raised for invalid 'osm_keys_to_keep'")
def test_using_incorrect_tags(test_pbf):
    """Non-string entries in tags_as_columns must raise ValueError.

    BUG FIX: the original try/except silently passed when no exception
    was raised; the `else` branch now fails explicitly in that case.
    """
    from pyrosm import OSM

    osm = OSM(filepath=test_pbf)

    # Incorrect tags
    # --------------
    try:
        osm.get_data_by_custom_criteria(custom_filter={"building": ["retail"]},
                                        tags_as_columns=[1])
    except ValueError as e:
        assert "All tags listed in 'tags_as_columns' should be strings" \
            in str(e), f"Unexpected error message: {e}"
    else:
        raise AssertionError(
            "ValueError not raised for non-string 'tags_as_columns'")
def test_reading_custom_from_area_having_none(helsinki_pbf):
    """Querying an area with no matching data should warn and return None.

    BUG FIX: the original checked the warning text with `if ...: pass`,
    which can never fail; the text is now asserted against the recorded
    warning messages.
    """
    from pyrosm import OSM

    # Bounding box for area that does not have any data
    bbox = [24.940514, 60.173849, 24.942, 60.175892]
    osm = OSM(filepath=helsinki_pbf, bounding_box=bbox)

    # The tool should warn if no data was found
    with pytest.warns(UserWarning) as w:
        gdf = osm.get_data_by_custom_criteria({"highway": ["primary"]})

    # At least one recorded warning must carry the expected text
    assert any("could not find any OSM data" in str(rec.message) for rec in w)

    # Result should be None
    assert gdf is None
def test_using_multiple_filters(helsinki_pbf):
    """A filter with several keys should constrain each key independently."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame

    reader = OSM(filepath=helsinki_pbf)
    result = reader.get_data_by_custom_criteria({
        "shop": ["alcohol"],
        "amenity": ["pub"]
    })

    # shop and amenity columns should only contain alcohol and pub as requested;
    # keep only the string values (drop the None/NaN placeholders)
    shops = [v for v in result["shop"].unique().tolist() if isinstance(v, str)]
    amenities = [
        v for v in result["amenity"].unique().tolist() if isinstance(v, str)
    ]

    assert isinstance(result, GeoDataFrame)
    assert shops == ["alcohol"]
    assert amenities == ["pub"]
    assert result.shape == (59, 32)
def test_parsing_osm_with_custom_filter_by_including_tags(test_pbf):
    """tags_as_columns should limit the columns to the requested tag plus core fields."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame
    import pyproj

    reader = OSM(filepath=test_pbf)

    # Get all "retail" buildings, keeping only 'building' as a tag column
    result = reader.get_data_by_custom_criteria(
        custom_filter={"building": ["retail"]},
        filter_type="keep",
        osm_keys_to_keep="building",
        tags_as_columns=["building"])

    assert isinstance(result, GeoDataFrame)

    # Only following columns should exist after specifying tags_as_columns
    allowed_columns = [
        "geometry", "tags", "building", "id", "osm_type", "version",
        "timestamp", "changeset"
    ]
    for col in result.columns:
        assert col in allowed_columns

    # Building column should not have any "residential" tags
    assert len(result["building"].unique()) == 1
    assert result["building"].unique()[0] == "retail"

    # Required keys must always be present
    for col in ('id', 'geometry'):
        assert col in result.columns

    # Test shape and CRS
    assert len(result) == 2
    assert result.crs == pyproj.CRS.from_epsg(4326)
def test_custom(test_pbf):
    """Smoke test: a minimal custom query returns a GeoDataFrame."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame

    result = OSM(test_pbf).get_data_by_custom_criteria(
        {"highway": ["secondary"]})
    assert isinstance(result, GeoDataFrame)
def test_custom_filters_with_custom_keys(helsinki_region_pbf):
    """Custom keys (route/railway/bus) can be combined, and results stay within the filter values."""
    from pyrosm import OSM
    from geopandas import GeoDataFrame

    # Get first all data
    reader = OSM(filepath=helsinki_region_pbf)

    # Test reading public transport related data
    pt = reader.get_data_by_custom_criteria(
        custom_filter={'public_transport': True},
        filter_type="keep",
    )
    assert isinstance(pt, GeoDataFrame)
    assert len(pt) == 5542

    # Test a more complicated query
    # -----------------------------
    # Test reading all transit related data (bus, trains, trams, metro etc.)
    # Exclude nodes (not keeping stops, etc.)
    routes = [
        "bus", "ferry", "railway", "subway", "train", "tram", "trolleybus"
    ]
    rails = ["tramway", "light_rail", "rail", "subway", "tram"]
    # 'express' comes with routes
    bus = ['yes', "express"]

    transit = reader.get_data_by_custom_criteria(
        custom_filter={
            'route': routes,
            'railway': rails,
            'bus': bus
        },
        filter_type="keep",
        keep_nodes=False)

    required_columns = ["railway", "bus", "route"]
    assert all(col in transit.columns for col in required_columns)

    # Check individual counts
    correct_counts = {'railway': 1456, 'route': 824, 'bus': 79}
    for col in required_columns:
        cnt = len(transit[col].dropna())
        correct = correct_counts[col]
        assert cnt == correct, f"Incorrect count for {col}. " \
                               f"Should have {correct}, found {cnt}."

    # Ensure that the data contains only data specified in the filters
    for col, allowed in (("route", routes), ("railway", rails), ("bus", bus)):
        for value in transit[col].unique():
            if value is None or str(value) == "nan":
                continue
            assert value in allowed

    assert isinstance(transit, GeoDataFrame)
    assert len(transit) == 2357

    # When using custom filters all records should have a value on at least
    # one of the filtered attributes: dropping rows that are NaN on all of
    # them must not remove anything.
    selected = transit[required_columns]
    no_nans = selected.dropna(subset=required_columns, how="all")
    assert selected.shape == no_nans.shape
class ProcessOSM:
    """ Processing Class

    Extracts themed layers from an OSM PBF file (via a module-level
    pyrosm ``OSM`` reader), optionally clips them to a polygon layer or a
    bounding box, splits them by geometry family and writes each slice to
    shp / geojson / gpkg output files. Themes are processed in parallel
    with a ProcessPoolExecutor.
    """

    def __init__(self, inputs, output, prefix, ext, themes, features):
        # Path to the input PBF file (passed to pyrosm's OSM reader)
        self.inputs = inputs
        # Output directory for exported files
        self.output = output
        # OSM keys to extract; one executor task per theme
        self.themes = themes
        # Geometry families to export: keys of `geod` in process_key()
        self.features = features
        # Worker process count for the pool
        self.workers = 1
        # Optional path to a clip layer read with geopandas; None = no clipping
        self.clip_data = None
        # GeoDataFrame used for clipping, derived in process()
        self.clip_gdf = None
        # pyrosm OSM reader, created in process()
        self.osm = None
        self.keep = False  # False removes invalid geometries
        # Flag is currently unused (the warnings filter below is commented out)
        self.show_warning = False
        # Filename prefix for all outputs
        self.prefix = prefix
        # Output format selector: 'shp', 'geojson', anything else = GPKG
        self.ext = ext
        # Optional [minx, miny, maxx, maxy] bounding box (EPSG:4326 assumed — TODO confirm)
        self.bbox = None
        # Optional layer name when clip_data is a FileGDB
        self.layer = None

    def process(self):
        """ Handle general multiprocessing workflow. """
        # if self.show_warning:
        #     warnings.filterwarnings("ignore")
        # warnings.filterwarnings("ignore")
        begin_time = time.time()
        # Build the clip geometry and the OSM reader. Three cases:
        # an explicit clip layer, a bounding box, or the whole file.
        if self.clip_data is not None:
            if self.layer is None:
                self.clip_gdf = gpd.read_file(self.clip_data)
            else:
                # A layer name implies a FileGDB source; abort on read failure
                try:
                    self.clip_gdf = gpd.read_file(self.clip_data,
                                                  driver="FileGDB",
                                                  layer=self.layer)
                except ValueError as e:
                    print(e)
                    exit()
            # Dissolve the clip layer to a single geometry for the reader
            geo = self.clip_gdf.geometry.unary_union
            self.osm = OSM(self.inputs, geo)
        elif self.bbox is not None:
            self.osm = OSM(self.inputs, self.bbox)
            # Create Clip GDF from bbox coordinate
            p = Polygon([(self.bbox[0], self.bbox[1]),
                         (self.bbox[0], self.bbox[3]),
                         (self.bbox[2], self.bbox[3]),
                         (self.bbox[2], self.bbox[1])])
            self.clip_gdf = gpd.GeoDataFrame({'geometry': [p]},
                                             geometry='geometry')
            self.clip_gdf.set_crs(epsg=4326, inplace=True)
        else:
            # No clipping: read the whole PBF
            self.osm = OSM(self.inputs)
        # self.process_key(self.themes[8])
        # for theme in self.themes:
        #     self.process_key(theme)
        futures = []
        # Fan out one process_key task per theme
        with ProcessPoolExecutor(max_workers=self.workers) as executor:
            for theme in self.themes:
                futures.append(executor.submit(self.process_key, theme))
            # for f in futures:
            #     print(f, 'running?', f.running())
            for x in as_completed(futures):
                #for f in futures:
                #    print(f, 'running?', f.running())
                if x.exception() is not None:
                    print(f'Future Exception {x.exception()}')
                    # Kill remaining child processes
                    kill_child_processes(os.getpid())
                    # Plagued by general memory errors from pyrosm
                    # Consumes all memory ram/swap then hangs machine
                    # you can't cancel an active job
                    # shutdown only cancels queued tasks not active tasks
                    # only thing left to do is kill processes if there is an error
                    #print('cancel')
                    #for f in futures:
                    #    f.cancel()
                    # executor.shutdown(wait=False)
                    # for f in futures:
                    #     f.cancel()
                    #     print(f, 'running?', f.running())
                    #     if f.running():
                    #         f.cancel()
                    #         print('Cancelled? ', f.cancelled())
                    # exit()
                #try:
                #    print(x.result())
                #except Exception as exc:
                #    print(f'generated an exception: {exc}')
                #    exit()
        total_time = time.time() - begin_time
        print('Done after {} seconds.'.format(round(total_time, 0)))

    def process_key(self, theme):
        """ Workflow for processing OSM data

        Extracts one theme from the PBF, splits the result by geometry
        family, optionally clips it, and writes each non-empty slice.
        Returns the theme name (used only as the future's result).
        """
        begin_time = time.time()
        # Map each feature family name to its concrete geometry types
        geod = {
            'point': ['Point', 'MultiPoint'],
            'line': ['LineString', 'MultiLineString'],
            'polygon': ['Polygon', 'MultiPolygon']
        }
        print(f'Processing PBF for {theme}')
        try:
            # Boolean filter keeps every element carrying the theme key
            gdf = self.osm.get_data_by_custom_criteria(
                osm_keys_to_keep=theme, custom_filter={theme: True})
        except Exception as e:
            print('Bad Mojo')
            print(f'Exception Exit {e} theme :{theme}')
            # Re-raise so the parent's as_completed() sees the failure
            raise  #RuntimeError(f'Exception Exit {e}')
            #exit()
        print('Done PBF for {} after {} seconds.'.format(
            theme, round(time.time() - begin_time, 0)))
        if gdf is not None:
            theme_time = time.time()
            # Tag each row with its geometry type for the family split below
            gdf['geom_type'] = gdf.geometry.geom_type
            for geo in self.features:
                print(f'Processing {theme}:{geo}')
                theme_time = time.time()
                gdf_select = gdf[gdf["geom_type"].isin(geod[geo])]
                if not gdf_select.empty:
                    if self.clip_gdf is not None:
                        try:
                            # Remove bad geometries in OSM file before clipping
                            if not self.keep:
                                start = gdf_select.shape[0]
                                gdf_select = gdf_select[
                                    gdf_select.geometry.is_valid]
                                if start != gdf_select.shape[0]:
                                    end = start - gdf_select.shape[0]
                                    print(
                                        f'\tRemoving {end} geometries from {theme}:{geo}'
                                    )
                            gdp_clip = gpd.clip(gdf_select, self.clip_gdf)
                            print('{}:{} shape {}'.format(
                                theme, geo, gdp_clip.shape))
                            print(
                                'Done Geodataframe processing: {}:{} after {} seconds .'
                                .format(theme, geo,
                                        round(time.time() - theme_time, 0)))
                            self.write_data(gdp_clip, theme, geo)
                        except GEOSException:
                            # Clipping failed — fall back to the unclipped slice
                            print(
                                f'Unable to clip {theme}:{geo} exporting unclipped'
                            )
                            self.write_data(gdf_select, theme, geo)
                            continue
                    else:
                        # No clip geometry: export the slice as-is
                        print('{}:{} shape {}'.format(theme, geo,
                                                      gdf_select.shape))
                        print(
                            'Done Geodataframe processing: {}:{} after {} seconds .'
                            .format(theme, geo,
                                    round(time.time() - theme_time, 0)))
                        self.write_data(gdf_select, theme, geo)
                else:
                    print(f'\tEmpty dataframe {theme}:{geo}')
        else:
            # pyrosm returns None when nothing matched the theme
            print(f'\tEmpty theme {theme}')
        total_time = time.time() - begin_time
        print('Done {} after {} seconds.'.format(theme, round(total_time, 0)))
        return theme

    # noinspection SpellCheckingInspection
    def write_data(self, gdf_write, theme, geo):
        # Write one theme/geometry slice to <output>/<prefix>_<theme>_<geo>.<ext>;
        # any extension other than shp/geojson is written as a GPKG layer.
        begin_time = time.time()
        if self.ext == 'shp':
            outputfile_shp = os.path.join(self.output,
                                          f'{self.prefix}_{theme}_{geo}.shp')
            gdf_write.to_file(outputfile_shp)
        elif self.ext == 'geojson':
            outputfile_gjson = os.path.join(
                self.output, f'{self.prefix}_{theme}_{geo}.geojson')
            gdf_write.to_file(outputfile_gjson, driver='GeoJSON')
        else:
            outputfile_gpkg = os.path.join(
                self.output, f'{self.prefix}_{theme}_{geo}.gpkg')
            gdf_write.to_file(outputfile_gpkg,
                              layer='{}_{}'.format(theme, geo),
                              driver="GPKG")
        print('Done {}:{} in {} seconds to file.'.format(
            theme, geo, round(time.time() - begin_time, 0)))