def create_fishnet(extents_file, out_folder, prefix, verbose=True):
    ''' Create a 1 km fishnet inside each feature in the input extents_file

    INPUT
    extents_file [string] - path to urban extents; the 'Pop', 'ID' and 'geometry' fields are read
    out_folder [string path] - where output shapefiles should be written
    prefix [string] - prepended to each fishnet shapefile name (<prefix>_<ID>.shp)
    verbose [boolean] - whether to log each fishnet after it has been written

    A fishnet whose shapefile already exists in out_folder is skipped, so the
    function can be re-run without redoing finished features.
    '''
    def _epsg_code(crs):
        # .crs is a pyproj CRS (with to_epsg) in newer geopandas and a
        # {'init': 'epsg:NNNN'} dictionary in older versions; support both.
        to_epsg = getattr(crs, 'to_epsg', None)
        if callable(to_epsg):
            code = to_epsg()
            if code is not None:
                return int(code)
        init = crs['init'] if isinstance(crs, dict) else str(crs)
        return int(init.split(":")[-1])

    def _centroid_geohash(geom):
        # geohash.encode is called with (y, x) of the cell centroid
        centre = geom.centroid
        return geohash.encode(centre.y, centre.x)

    urban_extents = gpd.read_file(extents_file)
    sel_cities = urban_extents.sort_values(['Pop'], ascending=False)
    try:
        sel_cities = misc.project_UTM(sel_cities)
    except Exception:
        # Fall back to Web Mercator if the UTM projection helper fails
        sel_cities = sel_cities.to_crs({"init":"epsg:3857"})

    for idx, row in sel_cities.iterrows():
        out_fishnet = os.path.join(out_folder, "%s_%s.shp" % (prefix, row['ID']))
        if os.path.exists(out_fishnet):
            continue
        # bounds is (minx, miny, maxx, maxy); createFishnet receives the x pair, then the y pair
        b = row['geometry'].bounds
        crs_num = _epsg_code(sel_cities.crs)
        misc.createFishnet(out_fishnet, b[0], b[2], b[1], b[3], 1000, 1000, crsNum=crs_num)
        # Keep only the cells touching the feature, then overwrite the file in WGS84
        fishnet = gpd.read_file(out_fishnet)
        fishnet = fishnet[fishnet.intersects(row['geometry'])]
        fishnet = fishnet.to_crs({'init':'epsg:4326'})
        fishnet['geohash'] = fishnet['geometry'].apply(_centroid_geohash)
        fishnet.to_file(out_fishnet)
        # NOTE(review): source whitespace was collapsed; logging is assumed to
        # happen only for newly created fishnets - confirm.
        if verbose:
            misc.tPrint("%s: %s" % (prefix, row['ID']))
def summarize_DHS(template, dhs_files, country_folder, iso3):
    ''' combine DHS data with WorldPop population and run zonal stats

    INPUT
        template [string] - template raster upon which to base rasterization of DHS
        dhs_files [dictionary] - defines DHS files to process {filename:geopandas}
        country_folder [string to path] - folder to create output
        iso3 [string] - value matched against the 'ISO3' column of each DHS table

    RETURNS
        dictionary keyed by '<filename>_<field>' describing each population-weighted
        raster written to country_folder ('raster_file', 'vars', 'description').
        Empty when no field qualifies.
    '''
    # Only columns whose name occurs in exactly one DHS table are processed
    col_count = Counter(col for inD in dhs_files.values() for col in inD.columns.values)
    unq_columns = {col for col, count in col_count.items() if count == 1}

    dhs_rasters = {}
    if not unq_columns:
        # Nothing can be rasterized, so the template raster is never needed
        return dhs_rasters

    with rasterio.open(template) as inP:
        for key, inD in dhs_files.items():
            sel_dhs = inD.loc[inD['ISO3'] == iso3]
            if sel_dhs.shape[0] == 0:
                continue
            for field in inD.columns:
                if field not in unq_columns:
                    continue
                out_file = os.path.join(country_folder, f'{key}_{field}.tif')
                out_file_pop = os.path.join(country_folder, f'{key}_{field}_pop.tif')
                try:
                    # rasterize the desired field in the input DHS data
                    # NOTE(review): the full table (inD) is rasterized, not the
                    # iso3 subset (sel_dhs) - confirm this is intended.
                    if not os.path.exists(out_file) and not os.path.exists(out_file_pop):
                        rMisc.rasterizeDataFrame(inD, out_file, idField=field, templateRaster = template)
                    # Multiply the rasterized data frame by the population layer
                    if not os.path.exists(out_file_pop):
                        # Close the intermediate raster before it is deleted below
                        with rasterio.open(out_file) as dhs_raster:
                            combine_dhs_pop(inP, dhs_raster, out_file_pop, factor=100)
                    if os.path.exists(out_file):
                        os.remove(out_file)
                    misc.tPrint(f'{iso3}_{key}: {field}')
                    dhs_rasters[f'{key}_{field}'] = {
                        'raster_file': f'{key}_{field}_pop.tif',
                        'vars': ['SUM', 'MEAN'],
                        'description': f'{key}_{field}'
                    }
                except Exception:
                    # Best effort: a failing field is logged and skipped
                    misc.tPrint(f"Error processing {key} - {field}")
    return dhs_rasters
def run_zonal(admin_shapes, rasters, out_suffix='', iso3=''):
    ''' Calculate zonal results for submitted admin and raster

    INPUTS
        admin_shapes [list of strings] - paths to the shapefiles within which to calculate statistics
        rasters [dictionary] - data dictionary containing the raster and the required information
            {
                'HNP_Var1':{
                    'raster_file': 'path_to_raster',
                    'vars':['SUM','MEAN'],
                    'description':'Lorem Ipsum'
                }
            }
            When vars[0] is 'C' the raster is treated as categorical and the
            definition must also hold 'unqVals'; counts are stored as LC_<value>.
        out_suffix [string] - text to append to output zonal file
        iso3 [string] - only used in the log message

    One CSV (<shapefile>_zonal<out_suffix>.csv) is written next to each shapefile.
    Shapefiles whose CSV already exists, or for which no raster file exists, are skipped.
    '''
    for shp in admin_shapes:
        out_zonal = shp.replace(".shp", "_zonal%s.csv" % out_suffix)
        misc.tPrint(f"Processed: {iso3} {os.path.basename(shp)}")
        if os.path.exists(out_zonal):
            continue
        available = [(var_name, definition) for var_name, definition in rasters.items()
                     if os.path.exists(definition['raster_file'])]
        if not available:
            continue
        # Only read the shapefile once there is something to calculate
        inD = gpd.read_file(shp)
        for var_name, definition in available:
            if definition['vars'][0] == 'C':
                uVals = definition['unqVals']
                res = rMisc.zonalStats(inD, definition['raster_file'], rastType='C', unqVals=uVals, reProj=True)
                res = pd.DataFrame(res, columns=['LC_%s' % x for x in uVals])
                for column in res.columns:
                    inD[column] = res[column]
            else:
                # Zonal stats
                res = rMisc.zonalStats(inD, definition['raster_file'], minVal=0, reProj=True)
                res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN'])
                res.columns = [f"{var_name}_{x}" for x in res.columns]
                # Keep only the statistics requested for this variable
                for var in definition['vars']:
                    inD[f"{var_name}_{var}"] = res[f"{var_name}_{var}"]
        inD.drop(['geometry'], axis=1, inplace=True)
        pd.DataFrame(inD).to_csv(out_zonal)
def run_all(iso3, output_folder, dhs_files):
    ''' Summarize DHS data for one country and run base and DHS zonal statistics

    INPUT
        iso3 [string] - country code; also the name of the country sub-folder
        output_folder [string path] - folder containing one sub-folder per country
        dhs_files [dictionary] - DHS tables passed on to summarize_DHS

    Zonal statistics are run on every .shp file found under the country folder
    whose name does not contain "zonal".
    '''
    def _localize(definitions):
        # Work on a deep copy so the source definitions keep their relative raster paths
        localized = copy.deepcopy(definitions)
        for values in localized.values():
            values['raster_file'] = os.path.join(country_folder, values['raster_file'])
        return localized

    country_folder = os.path.join(output_folder, iso3)
    misc.tPrint("Processing %s" % iso3)

    # summarize DHS
    country_pop = os.path.join(country_folder, "WP_2020_1km.tif")
    dhs_rasters = summarize_DHS(country_pop, dhs_files, country_folder, iso3)

    # Run zonal stats
    cur_rasters = _localize(hnp_categories)
    cur_dhs = _localize(dhs_rasters)

    all_shps = []
    for root, dirs, files in os.walk(country_folder):
        for f in files:
            if f.endswith(".shp") and "zonal" not in f:
                all_shps.append(os.path.join(root, f))

    run_zonal(all_shps, cur_rasters, out_suffix="_BASE", iso3=iso3)
    misc.tPrint("***%s Calculated Base Zonal" % iso3)
    run_zonal(all_shps, cur_dhs, out_suffix="_DHS", iso3=iso3)
    misc.tPrint("***%s Calculated DHS Zonal" % iso3)
def zonalStats(inShp, inRaster, bandNum=1, mask_A=None, reProj=False, minVal='', maxVal='', verbose=False, rastType='N', unqVals=None, weighted=False, allTouched=False):
    ''' Run zonal statistics against an input shapefile. Returns array of SUM, MIN, MAX, and MEAN

    INPUT VARIABLES
    inShp [string or geopandas object] - path to input shapefile
    inRaster [string or rasterio object] - path to input raster

    OPTIONAL
    bandNum [integer] - band in raster to analyze
    reProj [boolean] - whether to reproject data to match, if not, raise an error
    minVal/maxVal [number] - if defined, will only calculate statistics on values above or below this number
    verbose [boolean] - whether to be loud with technical updates
    rastType [string N or C] - N is numeric and C is categorical. Categorical returns counts of numbers
    unqVals [array of numbers] - used in categorical zonal statistics, tabulates all these numbers, will report 0 counts
    mask_A [numpy boolean mask] - mask the desired band using an identical shape boolean mask. Useful for doing conditional zonal stats
    weighted [boolean] - apply weighted zonal calculations. This will determine the % overlap for each
        cell in the defined AOI. Will apply weights in calculations of numerical statistics
    allTouched [boolean] - passed to rasterize as all_touched; forced to True when weighted

    RETURNS
    array of arrays, one for each feature in inShp, in feature order. A feature
    that cannot be processed yields [-1, -1, -1, -1] (numeric) or one -1 per
    entry of unqVals (categorical), so rows stay aligned with inShp.

    RAISES
    ValueError - if the CRS of inShp and inRaster differ and reProj is False
    '''
    unqVals = [] if unqVals is None else unqVals

    def _failed():
        # Fresh sentinel row for a feature that could not be processed
        if rastType == 'N':
            return [-1, -1, -1, -1]
        return [-1 for _ in unqVals]

    inVector = gpd.read_file(inShp) if isinstance(inShp, str) else inShp
    # A raster opened here is closed on exit; a caller-supplied dataset is left open
    ownsRaster = isinstance(inRaster, str)
    curRaster = rasterio.open(inRaster, 'r') if ownsRaster else inRaster

    outputData = []
    try:
        # If mask is not none, apply mask
        if mask_A is not None:
            curRaster.write_mask(mask_A)

        if inVector.crs != curRaster.crs:
            if not reProj:
                raise ValueError("Input CRS do not match")
            inVector = inVector.to_crs(curRaster.crs)

        fCount = 0
        tCount = len(inVector['geometry'])
        # generate bounding box geometry for raster bbox
        b = curRaster.bounds
        rBox = box(b[0], b[1], b[2], b[3])

        for _, feature in inVector.iterrows():
            geometry = feature['geometry']
            fCount = fCount + 1
            try:
                # Clip geometries that extend beyond the edge of the raster
                if not rBox.contains(geometry):
                    geometry = geometry.intersection(rBox)
            except Exception:
                print("Error processing %s" % fCount)
                # Record a sentinel so output rows stay aligned with the input features
                outputData.append(_failed())
                continue

            try:
                if fCount % 1000 == 0 and verbose:
                    tPrint("Processing %s of %s" % (fCount, tCount))
                # get pixel coordinates of the geometry's bounding box
                ul = curRaster.index(*geometry.bounds[0:2])
                lr = curRaster.index(*geometry.bounds[2:4])
                # read the subset of the data into a numpy array
                window = ((float(lr[0]), float(ul[0] + 1)), (float(ul[1]), float(lr[1] + 1)))
                data = curRaster.read(bandNum, window=window, masked=mask_A is not None)

                if weighted:
                    allTouched = True
                    # Create a grid of the input raster (data)
                    rGrid = polygonizeArray(data, geometry.bounds, curRaster)
                    # Clip the grid by the input geometry
                    rGrid['gArea'] = rGrid.area
                    rGrid['newArea'] = rGrid.intersection(geometry).area
                    # Store the percent overlap
                    rGrid['w'] = rGrid['newArea'] / rGrid['gArea']
                    # Scale each cell in place by its fraction of overlap
                    # NOTE(review): on an integer band the weighted value is
                    # cast back to the band dtype - confirm this is acceptable.
                    for _, cell in rGrid.iterrows():
                        data[cell['row'], cell['col']] = data[cell['row'], cell['col']] * cell['w']

                # create an affine transform for the subset data
                t = curRaster.transform
                shifted_affine = Affine(t.a, t.b, t.c + ul[1] * t.a, t.d, t.e, t.f + lr[0] * t.e)

                # rasterize the geometry: 0 inside the geometry, 1 (masked) outside
                mask = rasterize(
                    [(geometry, 0)],
                    out_shape=data.shape,
                    transform=shifted_affine,
                    fill=1,
                    all_touched=allTouched,
                    dtype=np.uint8)

                # create a masked numpy array
                masked_data = np.ma.array(data=data, mask=mask.astype(bool))
                if rastType == 'N':
                    filtered = minVal != '' or maxVal != ''
                    if minVal != '':
                        masked_data = np.ma.masked_where(masked_data < minVal, masked_data)
                    if maxVal != '':
                        masked_data = np.ma.masked_where(masked_data > maxVal, masked_data)
                    if filtered and masked_data.count() == 0:
                        # Every cell was excluded by the min/max filter
                        results = [-1, -1, -1, -1]
                    else:
                        results = [np.nansum(masked_data), np.nanmin(masked_data),
                                   np.nanmax(masked_data), np.nanmean(masked_data)]
                if rastType == 'C':
                    if len(unqVals) > 0:
                        # Count only the cells inside the geometry, not the whole bounding window
                        xx = Counter(masked_data.compressed().tolist())
                        results = [xx.get(i, 0) for i in unqVals]
                    else:
                        results = np.unique(masked_data, return_counts=True)
                outputData.append(results)
            except Exception as e:
                if verbose:
                    print(e)
                outputData.append(_failed())
    finally:
        if ownsRaster:
            curRaster.close()
    return outputData
def extract_data(inG, inG1, inG2, inL, inR):
    ''' Extract the administrative boundaries and land cover for one country, then run the
        vulnerability and urban calculations

    INPUT
        inG [geopandas] - admin 0 boundaries with an 'ISO3' column
        inG1 [geopandas] - admin 1 boundaries with an 'ISO3' column
        inG2 [geopandas] - admin 2 boundaries with an 'ISO3' column
        inL - land cover raster handed to rMisc.clipRaster
        inR - raster handed to create_urban_data

    The country is not a parameter: the module-level names iso3, output_folder,
    pop_folder and pop_files are read and must be set before calling.
    Files that already exist in the country folder (adm0.shp, adm1.shp,
    adm2.shp, LC.tif) are not re-created.
    '''
    country_folder = os.path.join(output_folder, iso3)
    adm0_file = os.path.join(country_folder, "adm0.shp")
    adm1_file = os.path.join(country_folder, "adm1.shp")
    adm2_file = os.path.join(country_folder, "adm2.shp")
    lc_file = os.path.join(country_folder, "LC.tif")

    os.makedirs(country_folder, exist_ok=True)

    country_bounds = inG.loc[inG['ISO3'] == iso3].to_crs({'init': 'epsg:4326'})
    if not os.path.exists(adm0_file):
        country_bounds.to_file(adm0_file)

    # Admin 1 and 2 are best effort: a failure is logged and processing continues
    if not os.path.exists(adm1_file):
        try:
            country_adm1 = inG1.loc[inG1['ISO3'] == iso3].to_crs({'init': 'epsg:4326'})
            country_adm1.to_file(adm1_file)
        except Exception:
            misc.tPrint("%s Could not extract ADMIN 1" % iso3)
    if not os.path.exists(adm2_file):
        try:
            country_adm2 = inG2.loc[inG2['ISO3'] == iso3].to_crs({'init': 'epsg:4326'})
            country_adm2.to_file(adm2_file)
        except Exception:
            misc.tPrint("%s Could not extract ADMIN 2" % iso3)

    if not os.path.exists(lc_file):
        rMisc.clipRaster(inL, gpd.read_file(adm0_file), lc_file)

    calculate_vulnerability(iso3, country_folder, country_bounds, pop_folder, pop_files)
    misc.tPrint("***%s Calculated Vulnerability" % iso3)

    # NOTE(review): source whitespace was collapsed; the second create_urban_data
    # call is assumed to be a fallback that only runs when the first one fails - confirm.
    try:
        create_urban_data(iso3, country_folder, country_bounds, inR, calc_urban=False)
        misc.tPrint("***%s Calculated Urban Extents" % iso3)
    except Exception:
        misc.tPrint("%s errored on HD clusters" % iso3)
        try:
            create_urban_data(iso3, country_folder, country_bounds, inR, calc_urban=True, calc_hd_urban=False)
        except Exception:
            misc.tPrint("%s errored on all clusters" % iso3)
def zonalStats(inShp, inRaster, bandNum=1, reProj=False, minVal='', verbose=False, rastType='N', unqVals=None):
    ''' Run zonal statistics against an input shapefile

    INPUT VARIABLES
    inShp [string] - path to input shapefile
    inRaster [string] - path to input raster

    OPTIONAL
    bandNum [integer] - band in raster to analyze
    reProj [boolean] - whether to reproject data to match, if not, raise an error
    minVal [number] - if defined, will only calculate statistics on values above this number
    verbose [boolean] - whether to be loud with responses
    rastType [string N or C] - N is numeric and C is categorical. Categorical returns counts of numbers
    unqVals [array of numbers] - used in categorical zonal statistics, tabulates all these numbers, will report 0 counts

    RETURNS
    array of arrays, one for each feature in inShp, in feature order. Numeric
    rows are [SUM, MIN, MAX, MEAN]. A feature that cannot be processed yields
    one -1 per entry of unqVals for categorical runs with unqVals, otherwise
    [-1, -1, -1, -1].

    RAISES
    ValueError - if the CRS of inShp and inRaster differ and reProj is False
    '''
    unqVals = [] if unqVals is None else unqVals
    outputData = []
    with rasterio.open(inRaster, 'r') as curRaster:
        inVector = gpd.read_file(inShp)
        if inVector.crs != curRaster.crs:
            if not reProj:
                raise ValueError("Input CRS do not match")
            inVector = inVector.to_crs(curRaster.crs)

        tCount = len(inVector['geometry'])
        for fCount, geometry in enumerate(inVector['geometry'], start=1):
            if fCount % 1000 == 0 and verbose:
                tPrint("Processing %s of %s" % (fCount, tCount))
            try:
                # get pixel coordinates of the geometry's bounding box
                ul = curRaster.index(*geometry.bounds[0:2])
                lr = curRaster.index(*geometry.bounds[2:4])
                # read the subset of the data into a numpy array
                window = ((float(lr[0]), float(ul[0] + 1)), (float(ul[1]), float(lr[1] + 1)))
                data = curRaster.read(bandNum, window=window)

                # create an affine transform for the subset data
                t = curRaster.transform
                shifted_affine = Affine(t.a, t.b, t.c + ul[1] * t.a, t.d, t.e, t.f + lr[0] * t.e)

                # rasterize the geometry: 0 inside the geometry, 1 (masked) outside
                mask = rasterize(
                    [(geometry, 0)],
                    out_shape=data.shape,
                    transform=shifted_affine,
                    fill=1,
                    all_touched=True,
                    dtype=np.uint8)

                # create a masked numpy array
                masked_data = np.ma.array(data=data, mask=mask.astype(bool))
                if rastType == 'N':
                    if minVal != '':
                        masked_data = np.ma.masked_where(masked_data < minVal, masked_data)
                    if minVal != '' and masked_data.count() == 0:
                        # Every cell was excluded by the minimum-value filter
                        results = [-1, -1, -1, -1]
                    else:
                        results = [masked_data.sum(), masked_data.min(),
                                   masked_data.max(), masked_data.mean()]
                if rastType == 'C':
                    if len(unqVals) > 0:
                        # Count only the cells inside the geometry, not the whole bounding window
                        xx = Counter(masked_data.compressed().tolist())
                        results = [xx.get(i, 0) for i in unqVals]
                    else:
                        results = np.unique(masked_data, return_counts=True)
                outputData.append(results)
            except Exception as e:
                print(e)
                # Keep the sentinel the same width as a successful row
                if rastType == 'C' and len(unqVals) > 0:
                    outputData.append([-1 for _ in unqVals])
                else:
                    outputData.append([-1, -1, -1, -1])
    return outputData