def area_tables(source_df, target_df):
    """
    Construct area allocation and source-target correspondence tables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    target_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type

    Returns
    -------
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    Union geometry is a geometry formed by the intersection of a source
    geometry and a target geometry.

    SU maps source geometry to union geometry, UT maps union geometry to
    target geometry.
    """
    if not _check_crs(source_df, target_df):
        return None
    n_s = source_df.shape[0]
    n_t = target_df.shape[0]
    _left = np.arange(n_s)
    _right = np.arange(n_t)
    source_df.loc[:, "_left"] = _left  # create temporary index for union
    target_df.loc[:, "_right"] = _right  # create temporary index for union
    res_union = gpd.overlay(source_df, target_df, how="union")
    n_u, _ = res_union.shape
    SU = np.zeros(
        (n_s, n_u)
    )  # holds area of intersection of source geom with union geom
    UT = np.zeros((n_u, n_t))  # binary table mapping union geom to target geom
    for index, row in res_union.iterrows():
        # only union polygons that intersect both a source and a target geometry matter
        if not np.isnan(row["_left"]) and not np.isnan(row["_right"]):
            s_id = int(row["_left"])
            t_id = int(row["_right"])
            SU[s_id, index] = row["geometry"].area
            UT[index, t_id] = 1
    source_df.drop(["_left"], axis=1, inplace=True)
    target_df.drop(["_right"], axis=1, inplace=True)
    return SU, UT

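# --- Usage sketch for `area_tables` (illustrative only, not part of the module). ---
# A minimal sketch assuming two toy layers in a shared projected CRS; the unit
# squares and the "pop" column below are hypothetical data:
#
#   import geopandas
#   from shapely.geometry import box
#
#   src = geopandas.GeoDataFrame(
#       {"pop": [100, 200]},
#       geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)],
#       crs="EPSG:3857",
#   )
#   tgt = geopandas.GeoDataFrame(geometry=[box(0.5, 0, 1.5, 1)], crs="EPSG:3857")
#   SU, UT = area_tables(src, tgt)
#   # SU has shape (2, n_union); UT has shape (n_union, 1)
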
def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1):
    """Construct area allocation and source-target correspondence tables
    using a parallel spatial indexing approach.

    NOTE: currently, the largest df is chunked and the other one is shipped
    in full to each core; within each process, the spatial index is built for
    the largest set of geometries, and the other one is used for `query_bulk`.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame containing input data and polygons
    target_df : geopandas.GeoDataFrame
        GeoDataFrame defining the output geometries
    n_jobs : int
        [Optional. Default=-1] Number of processes to run in parallel. If -1,
        this is set to the number of CPUs available.

    Returns
    -------
    tables : scipy.sparse.dok_matrix
    """
    from joblib import Parallel, delayed, parallel_backend

    if not _check_crs(source_df, target_df):
        return None
    if n_jobs == -1:
        n_jobs = os.cpu_count()
    df1 = source_df.copy()
    df2 = target_df.copy()

    # Chunk the largest, ship the smallest in full
    # (compare row counts; the original `df2.shape[1]` compared against the
    # column count, which was a bug)
    if df1.shape[0] > df2.shape[0]:
        to_chunk = df1
        df_full = df2
    else:
        to_chunk = df2
        df_full = df1

    # Spatial index query
    # Reindex on positional IDs
    to_workers = _chunk_dfs(
        gpd.GeoSeries(to_chunk.geometry.values, crs=to_chunk.crs),
        gpd.GeoSeries(df_full.geometry.values, crs=df_full.crs),
        n_jobs,
    )
    with parallel_backend("loky", inner_max_num_threads=1):
        worker_out = Parallel(n_jobs=n_jobs)(
            delayed(_index_n_query)(*chunk_pair) for chunk_pair in to_workers
        )
    ids_src, ids_tgt = np.concatenate(worker_out).T

    # Intersection + area calculation
    chunks_to_intersection = _chunk_polys(
        np.vstack([ids_src, ids_tgt]).T, df1.geometry, df2.geometry, n_jobs
    )
    with parallel_backend("loky", inner_max_num_threads=1):
        worker_out = Parallel(n_jobs=n_jobs)(
            delayed(_intersect_area_on_chunk)(*chunk_pair)
            for chunk_pair in chunks_to_intersection
        )
    areas = np.concatenate(worker_out)

    # Build DOK table
    table = coo_matrix(
        (areas, (ids_src, ids_tgt)),
        shape=(df1.shape[0], df2.shape[0]),
        dtype=np.float32,
    )
    table = table.todok()
    return table

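# --- Usage sketch for `_area_tables_binning_parallel` (illustrative only). ---
# Assuming `src` and `tgt` as in the sketch above, the parallel path yields
# the same sparse allocation table as the single-core binning, computed
# across all available cores:
#
#   table = _area_tables_binning_parallel(src, tgt, n_jobs=-1)
#   table.shape      # (len(src), len(tgt))
#   table.toarray()  # dense view; entry [i, j] is the i-j intersection area
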
def _area_interpolate_binning(
    source_df,
    target_df,
    extensive_variables=None,
    intensive_variables=None,
    table=None,
    allocate_total=True,
    spatial_index="auto",
    n_jobs=1,
    categorical_variables=None,
):
    """
    Area interpolation for extensive, intensive and categorical variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
    target_df : geopandas.GeoDataFrame
    extensive_variables : list
        [Optional. Default=None] Columns in dataframes for extensive variables
    intensive_variables : list
        [Optional. Default=None] Columns in dataframes for intensive variables
    table : scipy.sparse.dok_matrix
        [Optional. Default=None] Area allocation source-target correspondence
        table. If not provided, it will be built from `source_df` and
        `target_df` using `tobler.area_interpolate._area_tables_binning`.
    allocate_total : boolean
        [Optional. Default=True] True if total value of source area should be
        allocated. False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is exhausted by
        intersections. See Notes for more details.
    spatial_index : str
        [Optional. Default="auto"] Spatial index to use to build the
        allocation of area from source to target tables. It currently
        supports the following values:
            - "source": build the spatial index on `source_df`
            - "target": build the spatial index on `target_df`
            - "auto": attempts to guess the most efficient alternative.
              Currently, this option uses the largest table to build the
              index, and performs a `bulk_query` on the shorter table.
        This argument is ignored if n_jobs>1 (or n_jobs=-1).
    n_jobs : int
        [Optional. Default=1] Number of processes to run in parallel to
        generate the area allocation. If -1, this is set to the number of
        CPUs available. If `table` is passed, this is ignored.
        NOTE: as of Jan'21 multi-core functionality requires master versions
        of `pygeos` and `geopandas`.
    categorical_variables : list
        [Optional. Default=None] Columns in dataframes for categorical variables

    Returns
    -------
    estimates : geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and target_df
        geometry as output geometry

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}

    For categorical variables, the estimate returns the ratio of presence of
    each unique category.
""" source_df = source_df.copy() target_df = target_df.copy() if _check_crs(source_df, target_df): pass else: return None if table is None: if n_jobs == 1: table = _area_tables_binning(source_df, target_df, spatial_index) else: table = _area_tables_binning_parallel(source_df, target_df, n_jobs=n_jobs) den = source_df.area.values if allocate_total: den = np.asarray(table.sum(axis=1)) den = den + (den == 0) den = 1.0 / den n = den.shape[0] den = den.reshape((n, )) den = diags([den], [0]) weights = den.dot(table) # row standardize table dfs = [] extensive = [] if extensive_variables: for variable in extensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) estimates = diags([vals], [0]).dot(weights) estimates = estimates.sum(axis=0) extensive.append(estimates.tolist()[0]) extensive = np.asarray(extensive) extensive = np.array(extensive) extensive = pd.DataFrame(extensive.T, columns=extensive_variables) area = np.asarray(table.sum(axis=0)) den = 1.0 / (area + (area == 0)) n, k = den.shape den = den.reshape((k, )) den = diags([den], [0]) weights = table.dot(den) intensive = [] if intensive_variables: for variable in intensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) n = vals.shape[0] vals = vals.reshape((n, )) estimates = diags([vals], [0]) estimates = estimates.dot(weights).sum(axis=0) intensive.append(estimates.tolist()[0]) intensive = np.asarray(intensive) intensive = pd.DataFrame(intensive.T, columns=intensive_variables) if categorical_variables: categorical = {} for variable in categorical_variables: unique = source_df[variable].unique() for value in unique: mask = source_df[variable] == value categorical[f"{variable}_{value}"] = np.asarray( table[mask].sum(axis=0))[0] categorical = pd.DataFrame(categorical) categorical = categorical.div(target_df.area.values, axis="rows") if extensive_variables: dfs.append(extensive) if intensive_variables: dfs.append(intensive) if categorical_variables: dfs.append(categorical) df = pd.concat(dfs, axis=1) df["geometry"] = target_df[target_df.geometry.name].reset_index(drop=True) df = gpd.GeoDataFrame(df.replace(np.inf, np.nan)) return df
def _area_tables_binning(source_df, target_df, spatial_index):
    """Construct area allocation and source-target correspondence tables
    using a spatial indexing approach.

    NOTE: this currently relies on GeoPandas' spatial index machinery.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame containing input data and polygons
    target_df : geopandas.GeoDataFrame
        GeoDataFrame defining the output geometries
    spatial_index : str
        Spatial index to use to build the allocation of area from source to
        target tables. It currently supports the following values:
            - "source": build the spatial index on `source_df`
            - "target": build the spatial index on `target_df`
            - "auto": attempts to guess the most efficient alternative.
              Currently, this option uses the largest table to build the
              index, and performs a `bulk_query` on the shorter table.

    Returns
    -------
    tables : scipy.sparse.dok_matrix
    """
    if not _check_crs(source_df, target_df):
        return None

    df1 = source_df.copy()
    df2 = target_df.copy()

    # it is generally more performant to use the longer df as spatial index
    if spatial_index == "auto":
        if df1.shape[0] > df2.shape[0]:
            spatial_index = "source"
        else:
            spatial_index = "target"

    if spatial_index == "source":
        ids_tgt, ids_src = df1.sindex.query_bulk(df2.geometry, predicate="intersects")
    elif spatial_index == "target":
        ids_src, ids_tgt = df2.sindex.query_bulk(df1.geometry, predicate="intersects")
    else:
        raise ValueError(
            f"'{spatial_index}' is not a valid option. Use 'auto', 'source' or 'target'."
        )

    areas = (
        df1.geometry.values[ids_src].intersection(df2.geometry.values[ids_tgt]).area
    )

    table = coo_matrix(
        (areas, (ids_src, ids_tgt)),
        shape=(df1.shape[0], df2.shape[0]),
        dtype=np.float32,
    )
    table = table.todok()
    return table

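# --- Usage sketch for `_area_tables_binning` (illustrative only). ---
# Building the allocation table once and reusing it across interpolation
# calls avoids recomputing the pairwise intersections; `src` and `tgt` are
# the hypothetical layers from the first sketch:
#
#   table = _area_tables_binning(src, tgt, spatial_index="auto")
#   est_a = _area_interpolate_binning(src, tgt, extensive_variables=["pop"], table=table)
#   est_b = _area_interpolate_binning(src, tgt, intensive_variables=["density"], table=table)
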
def area_tables_raster(
    source_df, target_df, raster_path, codes=[21, 22, 23, 24], force_crs_match=True
):
    """
    Construct area allocation and source-target correspondence tables
    according to a raster's 'populated' areas.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    target_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    raster_path : str
        path to the associated raster image
    codes : list of int
        code values that should be considered as 'populated'. Since this
        draws inspiration from the National Land Cover Database (NLCD), the
        default is 21 (Developed, Open Space), 22 (Developed, Low Intensity),
        23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
        The description of each code can be found here:
        https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        Only taken into consideration for raster-based harmonization.
    force_crs_match : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be
        reprojected to the CRS of the raster file. It is recommended to leave
        this argument as True.

    Returns
    -------
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    Union geometry is a geometry formed by the intersection of a source
    geometry and a target geometry.

    SU maps source geometry to union geometry, UT maps union geometry to
    target geometry.
    """
    if not _check_crs(source_df, target_df):
        return None
    n_s = source_df.shape[0]
    n_t = target_df.shape[0]
    _left = np.arange(n_s)
    _right = np.arange(n_t)
    source_df.loc[:, "_left"] = _left  # create temporary index for union
    target_df.loc[:, "_right"] = _right  # create temporary index for union
    res_union_pre = gpd.overlay(source_df, target_df, how="union")

    # Establishing a CRS for the generated union
    warnings.warn(
        "The CRS for the generated union will be set to be the same as source_df."
    )
    res_union_pre.crs = source_df.crs

    # The 'append_profile_in_gdf' function is present in nlcd.py script
    res_union = fast_append_profile_in_gdf(
        res_union_pre, raster_path, force_crs_match=force_crs_match
    )

    str_codes = [str(i) for i in codes]
    str_list = ["Type_" + i for i in str_codes]

    # Extract list of code names that actually appear in the appended dataset
    str_list_ok = [col for col in res_union.columns if col in str_list]

    res_union["Populated_Pixels"] = res_union[str_list_ok].sum(axis=1)

    n_u, _ = res_union.shape
    SU = np.zeros(
        (n_s, n_u)
    )  # holds area of intersection of source geom with union geom
    UT = np.zeros((n_u, n_t))  # binary table mapping union geom to target geom

    for index, row in res_union.iterrows():
        # only union polygons that intersect both a source and a target geometry matter
        if not np.isnan(row["_left"]) and not np.isnan(row["_right"]):
            s_id = int(row["_left"])
            t_id = int(row["_right"])
            SU[s_id, index] = row["Populated_Pixels"]
            UT[index, t_id] = 1
    source_df.drop(["_left"], axis=1, inplace=True)
    target_df.drop(["_right"], axis=1, inplace=True)
    return SU, UT

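# --- Usage sketch for `area_tables_raster` (illustrative only). ---
# The raster path below is hypothetical; any NLCD-style land-cover GeoTIFF
# works, with `codes` selecting the classes counted as populated:
#
#   SU, UT = area_tables_raster(
#       src,
#       tgt,
#       raster_path="nlcd_2011.tif",   # hypothetical file
#       codes=[21, 22, 23, 24],        # NLCD "Developed" classes
#   )
#   # SU here holds populated-pixel counts (not areas) per source-union pair.
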
def area_interpolate(
    source_df,
    target_df,
    extensive_variables=[],
    intensive_variables=[],
    tables=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    target_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    extensive_variables : list
        columns in dataframes for extensive variables
    intensive_variables : list
        columns in dataframes for intensive variables
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t
    allocate_total : boolean
        True if total value of source area should be allocated. False if
        denominator is area of i. Note that the two cases would be identical
        when the area of the source polygon is exhausted by intersections.
        See Notes for more details.

    Returns
    -------
    estimates : tuple (2)
        (extensive variable array, intensive variables array)

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    if not _check_crs(source_df, target_df):
        return None
    if tables is None:
        SU, UT = area_tables(source_df, target_df)
    else:
        SU, UT = tables
    den = source_df["geometry"].area.values
    if allocate_total:
        den = SU.sum(axis=1)
    den = den + (den == 0)
    weights = np.dot(np.diag(1 / den), SU)

    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            estimates = np.dot(np.diag(vals), weights)
            estimates = np.dot(estimates, UT)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates)
    # convert unconditionally so an empty variable list still yields an array
    extensive = np.array(extensive)

    ST = np.dot(SU, UT)
    area = ST.sum(axis=0)
    den = np.diag(1.0 / (area + (area == 0)))
    weights = np.dot(ST, den)

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            vals.shape = (len(vals), 1)
            est = (vals * weights).sum(axis=0)
            intensive.append(est)
    intensive = np.array(intensive)

    return (extensive.T, intensive.T)

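# --- Usage sketch for `area_interpolate` (illustrative only). ---
# Unlike the GeoDataFrame-returning variants, this returns raw arrays of
# shape (n_targets, n_variables), one per variable type; `src`, `tgt` and
# the columns are the hypothetical data from the first sketch:
#
#   extensive, intensive = area_interpolate(
#       src, tgt, extensive_variables=["pop"], intensive_variables=["density"]
#   )
#   tgt["pop_est"] = extensive[:, 0]
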
def area_interpolate_binning(
    source_df,
    target_df,
    extensive_variables=[],
    intensive_variables=[],
    table=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    target_df : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type
    extensive_variables : list
        columns in dataframes for extensive variables
    intensive_variables : list
        columns in dataframes for intensive variables
    table : scipy.sparse.dok_matrix
    allocate_total : boolean
        True if total value of source area should be allocated. False if
        denominator is area of i. Note that the two cases would be identical
        when the area of the source polygon is exhausted by intersections.
        See Notes for more details.

    Returns
    -------
    estimates : tuple (2)
        (extensive variable array, intensive variables array)
        Each is of shape (n, v) where n is the number of target units and v
        is the number of variables for each variable type.

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    if not _check_crs(source_df, target_df):
        return None
    if table is None:
        table = area_tables_binning(source_df, target_df)
    den = source_df["geometry"].area.values
    if allocate_total:
        den = np.asarray(table.sum(axis=1))
    den = den + (den == 0)
    den = 1.0 / den
    n = den.shape[0]
    den = den.reshape((n,))
    den = diags([den], [0])
    weights = den.dot(table)  # row standardize table

    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            estimates = diags([vals], [0]).dot(weights)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates.tolist()[0])
    # convert unconditionally so an empty variable list still yields an array
    extensive = np.asarray(extensive)

    area = np.asarray(table.sum(axis=0))
    den = 1.0 / (area + (area == 0))
    n, k = den.shape
    den = den.reshape((k,))
    den = diags([den], [0])
    weights = table.dot(den)

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            n = vals.shape[0]
            vals = vals.reshape((n,))
            estimates = diags([vals], [0])
            estimates = estimates.dot(weights).sum(axis=0)
            intensive.append(estimates.tolist()[0])
    intensive = np.asarray(intensive)

    return (extensive.T, intensive.T)

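# --- Usage sketch for `area_interpolate_binning` (illustrative only). ---
# Same call pattern as `area_interpolate`, but backed by the sparse binning
# table, which scales to much larger layers; the precomputed table can be
# passed in to skip the intersection step:
#
#   table = area_tables_binning(src, tgt)
#   extensive, intensive = area_interpolate_binning(
#       src, tgt, extensive_variables=["pop"], intensive_variables=["density"],
#       table=table,
#   )
#   extensive.shape  # (len(tgt), 1)
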
def area_tables_binning(source_df, target_df):
    """Construct an area allocation table between source and target polygons
    using a bucketed (grid-binning) sweep over bounding boxes.

    Returns a scipy.sparse.dok_matrix of shape (n_source, n_target) whose
    entries hold the area of intersection between each source-target pair.
    """
    if not _check_crs(source_df, target_df):
        return None

    df1 = source_df
    df2 = target_df

    l1, b1, r1, t1 = df1.total_bounds
    l2, b2, r2, t2 = df2.total_bounds
    total_bounds = [min(l1, l2), min(b1, b2), max(r1, r2), max(t1, t2)]
    n1, k1 = df1.shape
    n2, k2 = df2.shape
    numPoly = n1 + n2
    DELTA = 0.000001

    # constants for bucket sizes
    BUCK_SM = 8
    BUCK_LG = 80
    SHP_SMALL = 1000

    shapebox = total_bounds

    # bucket size
    if numPoly < SHP_SMALL:
        bucketmin = numPoly // BUCK_SM + 2
    else:
        bucketmin = numPoly // BUCK_LG + 2

    # bucket length
    lengthx = ((shapebox[2] + DELTA) - shapebox[0]) / bucketmin
    lengthy = ((shapebox[3] + DELTA) - shapebox[1]) / bucketmin

    # initialize buckets
    columns1 = [set() for i in range(bucketmin)]
    rows1 = [set() for i in range(bucketmin)]
    columns2 = [set() for i in range(bucketmin)]
    rows2 = [set() for i in range(bucketmin)]

    minbox = shapebox[:2] * 2  # minx,miny,minx,miny
    binWidth = [lengthx, lengthy] * 2  # lenx,leny,lenx,leny
    bbcache = {}
    poly2Column1 = [set() for i in range(n1)]
    poly2Row1 = [set() for i in range(n1)]
    poly2Column2 = [set() for i in range(n2)]
    poly2Row2 = [set() for i in range(n2)]

    # bin the bounding boxes of the source polygons
    for i in range(n1):
        shpObj = df1.geometry.iloc[i]
        bbcache[i] = shpObj.bounds
        projBBox = [
            int((shpObj.bounds[:][j] - minbox[j]) / binWidth[j]) for j in range(4)
        ]
        for j in range(projBBox[0], projBBox[2] + 1):
            columns1[j].add(i)
            poly2Column1[i].add(j)
        for j in range(projBBox[1], projBBox[3] + 1):
            rows1[j].add(i)
            poly2Row1[i].add(j)

    # bin the bounding boxes of the target polygons
    for i in range(n2):
        shpObj = df2.geometry.iloc[i]
        bbcache[i] = shpObj.bounds
        projBBox = [
            int((shpObj.bounds[:][j] - minbox[j]) / binWidth[j]) for j in range(4)
        ]
        for j in range(projBBox[0], projBBox[2] + 1):
            columns2[j].add(i)
            poly2Column2[i].add(j)
        for j in range(projBBox[1], projBBox[3] + 1):
            rows2[j].add(i)
            poly2Row2[i].add(j)

    table = dok_matrix((n1, n2), dtype=np.float32)
    for polyId in range(n1):
        idRows = poly2Row1[polyId]
        idCols = poly2Column1[polyId]
        rowNeighbors = set()
        colNeighbors = set()
        for row in idRows:
            rowNeighbors = rowNeighbors.union(rows2[row])
        for col in idCols:
            colNeighbors = colNeighbors.union(columns2[col])
        # candidates must share both a row bucket and a column bucket
        neighbors = rowNeighbors.intersection(colNeighbors)
        for neighbor in neighbors:
            if df1.geometry.iloc[polyId].intersects(df2.geometry.iloc[neighbor]):
                intersection = df1.geometry.iloc[polyId].intersection(
                    df2.geometry.iloc[neighbor]
                )
                table[polyId, neighbor] = intersection.area
    return table

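# --- Usage sketch for `area_tables_binning` (illustrative only). ---
# The bucketed sweep returns a scipy dok_matrix keyed by (source, target)
# index pairs, so nonzero intersections can be iterated directly:
#
#   table = area_tables_binning(src, tgt)
#   for (i, j), area in table.items():
#       print(f"source {i} overlaps target {j} over {area:.2f} square units")
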
def _area_interpolate_binning(
    source_df,
    target_df,
    extensive_variables=None,
    intensive_variables=None,
    table=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
    target_df : geopandas.GeoDataFrame
    extensive_variables : list
        columns in dataframes for extensive variables
    intensive_variables : list
        columns in dataframes for intensive variables
    table : scipy.sparse.dok_matrix
    allocate_total : boolean
        True if total value of source area should be allocated. False if
        denominator is area of i. Note that the two cases would be identical
        when the area of the source polygon is exhausted by intersections.
        See Notes for more details.

    Returns
    -------
    estimates : geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and target_df
        geometry as output geometry

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    source_df = source_df.copy()
    target_df = target_df.copy()

    if not _check_crs(source_df, target_df):
        return None
    if table is None:
        table = _area_tables_binning(source_df, target_df)
    den = source_df["geometry"].area.values
    if allocate_total:
        den = np.asarray(table.sum(axis=1))
    den = den + (den == 0)
    den = 1.0 / den
    n = den.shape[0]
    den = den.reshape((n,))
    den = diags([den], [0])
    weights = den.dot(table)  # row standardize table

    dfs = []
    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            vals = _inf_check(source_df, variable)
            estimates = diags([vals], [0]).dot(weights)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates.tolist()[0])
        extensive = np.asarray(extensive)
        extensive = pd.DataFrame(extensive.T, columns=extensive_variables)

    area = np.asarray(table.sum(axis=0))
    den = 1.0 / (area + (area == 0))
    n, k = den.shape
    den = den.reshape((k,))
    den = diags([den], [0])
    weights = table.dot(den)

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            vals = _inf_check(source_df, variable)
            n = vals.shape[0]
            vals = vals.reshape((n,))
            estimates = diags([vals], [0])
            estimates = estimates.dot(weights).sum(axis=0)
            intensive.append(estimates.tolist()[0])
        intensive = np.asarray(intensive)
        intensive = pd.DataFrame(intensive.T, columns=intensive_variables)

    if extensive_variables:
        dfs.append(extensive)
    if intensive_variables:
        dfs.append(intensive)

    df = pd.concat(dfs, axis=1)
    df["geometry"] = target_df["geometry"].reset_index(drop=True)
    df = gpd.GeoDataFrame(df.replace(np.inf, np.nan))
    return df

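# --- Usage sketch (illustrative only): effect of `allocate_total`. ---
# When source polygons are only partially covered by targets (e.g. after
# coastline clipping), allocate_total=False divides by the full source area
# instead of the intersected area, so source totals are not forced to be
# fully reallocated:
#
#   est_full = _area_interpolate_binning(src, tgt, extensive_variables=["pop"])
#   est_part = _area_interpolate_binning(
#       src, tgt, extensive_variables=["pop"], allocate_total=False
#   )
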
def _area_interpolate(
    source_df,
    target_df,
    extensive_variables=None,
    intensive_variables=None,
    tables=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries
    target_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries
    extensive_variables : list (optional)
        columns in dataframes for extensive variables
    intensive_variables : list (optional)
        columns in dataframes for intensive variables
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t
    allocate_total : boolean
        True if total value of source area should be allocated. False if
        denominator is area of i. Note that the two cases would be identical
        when the area of the source polygon is exhausted by intersections.
        See Notes for more details.

    Returns
    -------
    estimates : geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and target_df
        geometry as output geometry

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::
        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    source_df = source_df.copy()
    target_df = target_df.copy()

    if not _check_crs(source_df, target_df):
        return None
    if tables is None:
        SU, UT = _area_tables(source_df, target_df)
    else:
        SU, UT = tables
    den = source_df[source_df.geometry.name].area.values
    if allocate_total:
        den = SU.sum(axis=1)
    den = den + (den == 0)
    weights = np.dot(np.diag(1 / den), SU)

    dfs = []
    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            vals = _inf_check(source_df, variable)
            estimates = np.dot(np.diag(vals), weights)
            estimates = np.dot(estimates, UT)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates)
        extensive = np.array(extensive)
        extensive = pd.DataFrame(extensive.T, columns=extensive_variables)

    ST = np.dot(SU, UT)
    area = ST.sum(axis=0)
    den = np.diag(1.0 / (area + (area == 0)))
    weights = np.dot(ST, den)

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            vals = _inf_check(source_df, variable)
            vals.shape = (len(vals), 1)
            est = (vals * weights).sum(axis=0)
            intensive.append(est)
        intensive = np.array(intensive)
        intensive = pd.DataFrame(intensive.T, columns=intensive_variables)

    if extensive_variables:
        dfs.append(extensive)
    if intensive_variables:
        dfs.append(intensive)

    df = pd.concat(dfs, axis=1)
    df["geometry"] = target_df[target_df.geometry.name].reset_index(drop=True)
    df = gpd.GeoDataFrame(df.replace(np.inf, np.nan))
    return df

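# --- Usage sketch for `_area_interpolate` (illustrative only). ---
# The dense-table variant accepts a precomputed (SU, UT) pair, so the
# expensive overlay can be shared across calls:
#
#   tables = _area_tables(src, tgt)
#   est = _area_interpolate(src, tgt, extensive_variables=["pop"], tables=tables)
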
def scanlines_interpolate(
    target_gdf, source_CTs, weights_long, raster_path, verbose=True
):
    """Generate interpolated values using scanlines with a given set of
    weights and Correction Terms.

    Parameters
    ----------
    target_gdf : geopandas.GeoDataFrame
        GeoDataFrame with geometry column of polygon type for the desired
        target set of polygons
    source_CTs : geopandas.GeoDataFrame
        GeoDataFrame with the Correction Terms for source polygons
    weights_long : numpy.array
        weights for all land types in the raster
    raster_path : str
        path to the associated raster image
    verbose : bool. Default is True.
        Whether the function will print progress steps.

    Returns
    -------
    interpolated_df : pandas.DataFrame
        interpolated values read back from the scanline output file
    """
    t0_aux = time.time()

    _check_presence_of_crs(target_gdf)
    _check_presence_of_crs(source_CTs)
    if not _check_crs(source_CTs, target_gdf):
        return None

    if verbose:
        print('Opening raster metadata...')
    raster = rasterio.open(raster_path)

    if verbose:
        print('Matching both CRSs (reprojecting source_CTs to raster)...')
    source_CTs = source_CTs.to_crs(crs=raster.crs.data)

    if verbose:
        print('...reprojecting target_gdf to raster...')
    target_gdf = target_gdf.to_crs(crs=raster.crs.data)

    # Java classpath separator: ";" on Windows ('nt'), ":" otherwise
    sep_cmd = ";" if os.name == 'nt' else ":"

    if 'geometry' not in target_gdf.columns:
        target_gdf['geometry'] = target_gdf[target_gdf._geometry_column_name]
        target_gdf = target_gdf.drop([target_gdf._geometry_column_name], axis=1)
        target_gdf = target_gdf.set_geometry('geometry')
    if 'geometry' not in source_CTs.columns:
        source_CTs['geometry'] = source_CTs[source_CTs._geometry_column_name]
        source_CTs = source_CTs.drop([source_CTs._geometry_column_name], axis=1)
        source_CTs = source_CTs.set_geometry('geometry')

    # Create a temporary directory for ALL input files
    with tempfile.TemporaryDirectory() as temp_dir:

        # parquet-like internal file
        if verbose:
            print('Starting to create well-known text (wkt) of geometries...')
        target_gdf['geometry_wkt'] = target_gdf['geometry'].apply(
            lambda x: x.wkt
        )  # Create well-known text (raw text) for the geometry column
        target_gdf = target_gdf.drop(['geometry'], axis=1)

        target_gdf_temp_file_name = os.path.join(temp_dir, 'target_gdf_temp.parquet')

        # Just extract the useful column for optimization
        target_gdf = target_gdf[['geometry_wkt']]

        if verbose:
            print('Starting to convert the GeoDataFrame to a temporary file...')
        target_gdf.to_parquet(target_gdf_temp_file_name)

        # parquet-like internal file
        if verbose:
            print('Source CT: Starting to create well-known text (wkt) of geometries...')
        source_CTs['geometry_wkt'] = source_CTs['geometry'].apply(
            lambda x: x.wkt
        )  # Create well-known text (raw text) for the geometry column
        source_CTs = source_CTs.drop(['geometry'], axis=1)

        source_CTs_temp_file_name = os.path.join(temp_dir, 'source_CTs_temp.parquet')

        # Just extract the useful columns for optimization
        # For source we also need the Correction Terms!
        source_CTs = source_CTs[['geometry_wkt', 'CT']]

        if verbose:
            print('Starting to convert the GeoDataFrame to a temporary file...')
        source_CTs.to_parquet(source_CTs_temp_file_name)

        weights_temp_file_name = os.path.join(temp_dir, 'input_weights.csv')
        np.savetxt(
            weights_temp_file_name,
            weights_long,
            delimiter=",",
            header='weights',
            comments='',
        )

        cmd_pre = os.path.join('java -client -cp dependency', '*')
        cmd = cmd_pre + "{}ucrspatial-6.0-SNAPSHOT.jar interpolate {} {} {} {}".format(
            sep_cmd,
            raster_path,
            source_CTs_temp_file_name,
            target_gdf_temp_file_name,
            weights_temp_file_name,
        )

        t1_aux = time.time()
        if verbose:
            print(
                'Time of preparation before scanline (in seconds): {}'.format(
                    t1_aux - t0_aux
                )
            )

        if verbose:
            print('Starting to perform the scanline...')
        t0_aux = time.time()
        run(cmd, shell=True, check=True)  # Will generate a parquet file for output: interpolate.parquet
        t1_aux = time.time()
        if verbose:
            print('Scanline: Done.')
        if verbose:
            print('Time of scanline itself (in seconds): {}'.format(t1_aux - t0_aux))

        interpolated_df = pd.read_parquet("interpolate.parquet")
        os.remove("interpolate.parquet")

    return interpolated_df
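
# --- Usage sketch for `scanlines_interpolate` (illustrative only). ---
# This function shells out to the ucrspatial jar, so it assumes Java is on
# PATH and a `dependency/` folder containing ucrspatial-6.0-SNAPSHOT.jar sits
# in the working directory. All inputs below are hypothetical:
#
#   interpolated = scanlines_interpolate(
#       target_gdf=tgt,                # polygons to interpolate onto
#       source_CTs=cts,                # must carry a "CT" column
#       weights_long=np.ones(256),     # hypothetical: one weight per class code
#       raster_path="nlcd_2011.tif",   # hypothetical land-cover raster
#   )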