def get_dataset(zip):
    """Gets singular geospatial dataset and layer for analysis.

    Validates rules:
    - There must be only one data source (.shp or .gdb) in the zip file.
    - There must be only one data layer in that data source.
    - The data source must contain the required sidecar files (.prj and .shx
      for a shapefile; .dbf is not used, so it is not required).

    Parameters
    ----------
    zip : open ZipFile

    Returns
    -------
    (str, str)
        tuple of geospatial file within zip file, name of layer
    """
    files = set(list_files(zip))
    geo_files = [f for f in list_files(zip) if f.endswith(".shp") or f.endswith(".gdb")]
    num_files = len(geo_files)

    if num_files == 0:
        log.error("Upload zip file does not contain shp or FGDB files")
        raise ValueError("zip file must include a shapefile or FGDB")

    if num_files > 1:
        log.error(
            f"Upload zip file contains {num_files} shp or FGDB files:\n{geo_files}"
        )
        raise ValueError("zip file must include only one shapefile or FGDB")

    filename = geo_files[0]

    if filename.endswith(".shp"):
        # a shapefile is only usable if its .prj and .shx sidecar files are present
        missing = []
        for ext in (".prj", ".shx"):
            if filename.replace(".shp", ext) not in files:
                missing.append(ext)

        if missing:
            log.error(f"Upload zip file contains .shp but not {','.join(missing)}")
            raise ValueError("zip file must include .shp, .prj, and .shx files")

    # Validate that dataset is a polygon type and has only a single layer
    layers = pio.list_layers(f"/vsizip/{zip.fp.name}/{filename}")

    if layers.shape[0] > 1:
        log.error(f"Upload data source contains multiple data layers\n{layers}")
        raise ValueError("data source must contain only one data layer")

    if "Polygon" not in layers[0, 1]:
        log.error(f"Upload data source is not a polygon: {layers[0, 1]}")
        raise ValueError("data source must be a Polygon type")

    return filename, layers[0, 0]
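# A minimal usage sketch for get_dataset, assuming this module's `list_files`,
# `log`, and `pio` (pyogrio) are available as above; the helper name
# `validate_upload` and the archive path are hypothetical, for illustration only.
def validate_upload(zip_path):
    from zipfile import ZipFile

    # open the uploaded archive and resolve the single data source and layer
    with ZipFile(zip_path) as zf:
        filename, layer = get_dataset(zf)

    # e.g. ("boundary.shp", "boundary") for a single-shapefile upload
    return filename, layer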
def test_list_layers(naturalearth_lowres, naturalearth_lowres_vsi, test_fgdb_vsi):
    assert array_equal(
        list_layers(naturalearth_lowres), [["naturalearth_lowres", "Polygon"]]
    )
    assert array_equal(
        list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
    )

    # Measured 3D is downgraded to 2.5D during read;
    # make sure this warning is raised
    with pytest.warns(
        UserWarning, match=r"Measured \(M\) geometry types are not supported"
    ):
        fgdb_layers = list_layers(test_fgdb_vsi)

    assert len(fgdb_layers) == 7

    # Make sure that nonspatial layer has None for geometry
    assert array_equal(fgdb_layers[0], ["basetable_2", None])

    # Confirm that measured 3D is downgraded to 2.5D during read
    assert array_equal(fgdb_layers[3], ["test_lines", "2.5D MultiLineString"])
    assert array_equal(fgdb_layers[6], ["test_areas", "2.5D MultiPolygon"])
def test_read_layer(test_fgdb_vsi):
    layers = list_layers(test_fgdb_vsi)

    # The first layer is read by default (NOTE: first layer has no features)
    df = read_dataframe(test_fgdb_vsi, read_geometry=False, max_features=1)
    df2 = read_dataframe(
        test_fgdb_vsi, layer=layers[0][0], read_geometry=False, max_features=1
    )
    assert_frame_equal(df, df2)

    # Reading a specific layer should return that layer.
    # Detected here by a known column.
    df = read_dataframe(
        test_fgdb_vsi, layer="test_lines", read_geometry=False, max_features=1
    )
    assert "RIVER_MILE" in df.columns
def test_vsi_read_layers(naturalearth_lowres_vsi):
    assert array_equal(
        list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
    )

    meta, geometry, fields = read(naturalearth_lowres_vsi)
    assert geometry.shape == (177,)
def convert_census_gdb(
    file,
    year=None,
    layers=None,
    level="bg",
    save_intermediate=True,
    combine=True,
    output_dir=".",
):
    """Convert file geodatabases from Census into (set of) parquet files.

    Parameters
    ----------
    file : str
        path to file geodatabase
    year : str, optional
        year that the data should be named by. If None, the year is inferred
        from the filename, based on the naming convention used on the Census
        Bureau FTP server
    layers : list, optional
        set of layers to extract from the geodatabase. If None (default), all
        layers will be extracted
    level : str, optional
        geographic level of data ('bg' for blockgroups or 'tr' for tracts),
        by default "bg"
    save_intermediate : bool, optional
        if True, each layer will be stored separately as a parquet file,
        by default True
    combine : bool, optional
        whether to concatenate the intermediate dataframes into a single
        combined parquet file, by default True
    output_dir : str, optional
        path to directory where parquet files will be written, by default "."
    """
    try:
        import pyogrio as ogr
    except ImportError:
        raise ImportError(
            "this function requires the `pyogrio` package\n"
            "`conda install pyogrio`"
        )

    if not layers:
        # grab all layers except the metadata layer
        year_suffix = file.split(".")[0].split("_")[1][-2:]
        meta_str = f"{level.upper()}_METADATA_20{year_suffix}"
        layers = [layer[0] for layer in ogr.list_layers(file)]
        if meta_str in layers:
            layers.remove(meta_str)

    if not year:
        # make a strong assumption about the name of the file coming from census
        year = file.split("_")[1]

    tables = []
    for i in layers:
        print(i)
        df = ogr.read_dataframe(file, layer=i).set_index("GEOID")
        if "ACS_" in i:
            # the ACS layer holds the geometries; keep it as a GeoDataFrame
            df = gpd.GeoDataFrame(df)
        else:
            # keep only estimate columns and rename them to canonical ACS variable names
            df = df[df.columns[df.columns.str.contains("e")]]
            df.columns = pd.Series(df.columns).apply(reformat_acs_vars)
        df = df.dropna(axis=1, how="all")
        if combine:
            tables.append(df)
        if save_intermediate:
            df.to_parquet(
                pathlib.PurePath(output_dir, f"acs_{year}_{i}_{level}.parquet")
            )
    if combine:
        df = pd.concat(tables, axis=1)
        if f"ACS_{year}_5YR_{level.upper()}" in layers:
            df = gpd.GeoDataFrame(df)
        df.to_parquet(pathlib.PurePath(output_dir, f"acs_{year}_{level}.parquet"))
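# A minimal sketch of calling convert_census_gdb. The geodatabase name, year,
# and output directory are illustrative assumptions, not values from the
# original module; the filename follows the Census Bureau FTP convention that
# the function relies on to infer `year` and the metadata layer name when omitted.
if __name__ == "__main__":
    convert_census_gdb(
        "ACS_2019_5YR_BG.gdb",
        year="2019",
        level="bg",
        save_intermediate=True,
        combine=True,
        output_dir=".",
    )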
import re
from pathlib import Path
from time import time

import pandas as pd
from pyogrio import read_dataframe, list_layers

from analysis.lib.util import append

# NLCD natural landcover classes
# descriptions here: https://www.mrlc.gov/data/legends/national-land-cover-database-2016-nlcd2016-legend
NATURAL_TYPES = {11, 12, 31, 41, 42, 43, 51, 52, 71, 72, 73, 74, 90, 95}

data_dir = Path("data")
src_dir = data_dir / "floodplains"
gdb_filename = src_dir / "NLCD2016_Floodplain_Stats_2020_12072020.gdb"

# fixes run later for region 02 that have to be spliced in
region02_gdb_filename = src_dir / "Region2FixedStats.gdb"

# layers have varying names; build a lookup of HUC2 code => layer name
layers = list_layers(gdb_filename)[:, 0]
layers = {re.search(r"\d+", l).group()[-2:]: l for l in layers}

huc4_df = pd.read_feather(
    data_dir / "boundaries/huc4.feather", columns=["HUC2", "HUC4"]
)

# Convert to dict of sorted HUC4s per HUC2
units = huc4_df.groupby("HUC2").HUC4.unique().apply(sorted).to_dict()

start = time()

merged = None
for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":