def merge_files(path, target, remove_source=False):
    """Merge files (the output of this Block) into one single file.

    All files in ``path`` that share the extension of ``target`` are
    combined. A single source file is copied (or moved) together with every
    file that has the same base name; several source files are concatenated
    record by record through fiona, using the driver, crs, schema and
    encoding of the first source file in sorted path order.

    Args:
      path (str): directory that contains the source files
      target (str): path of the file to create; it must not exist yet
      remove_source (bool): remove the source files while merging. After a
        multi-file merge the source directory is removed as well, provided
        it ended up empty.

    Raises:
      IOError: if ``target`` already exists or if no source file with a
        matching extension is found.
    """
    path = utils.safe_abspath(path)
    target = utils.safe_abspath(target)

    if os.path.exists(target):
        raise IOError("Target '{}' already exists".format(target))

    target_base, ext = os.path.splitext(target)
    # Escape the directory so glob metacharacters in it are taken literally,
    # and sort because glob does not guarantee any ordering: the first file
    # determines the output schema and the order fixes the record order.
    source_paths = sorted(
        glob.glob(os.path.join(glob.escape(path), "*" + ext))
    )
    if not source_paths:
        raise IOError(
            "No source files found with matching extension '{}'".format(ext)
        )

    if len(source_paths) == 1:
        # shortcut for single file. we need to copy/move all base_name.*
        # files (e.g. shapefiles have multiple files)
        source_base = os.path.splitext(source_paths[0])[0]
        move_or_copy = shutil.move if remove_source else shutil.copy
        for file_path in glob.glob(glob.escape(source_base) + ".*"):
            move_or_copy(
                file_path, target_base + os.path.splitext(file_path)[1]
            )
        return

    with utils.fiona_env():
        # first detect the driver etc
        with fiona.collection(source_paths[0], "r") as source:
            kwargs = {
                "driver": source.driver,
                "crs": source.crs,
                "schema": source.schema,
            }
            if source.encoding:
                kwargs["encoding"] = source.encoding

        with fiona.collection(target, "w", **kwargs) as out:
            for source_path in source_paths:
                with fiona.collection(source_path, "r") as source:
                    out.writerecords(v for k, v in source.items())
                if remove_source:
                    os.remove(source_path)

        if remove_source:
            try:
                os.rmdir(path)
            except OSError:
                # directory not empty: do nothing
                pass
def gdal_dataset(self):
    """Return the GDAL dataset for ``self.url``, opening it on first use.

    The result of ``gdal.Open`` is stored on ``self._gdal_dataset``; later
    calls return that stored object without opening the file again.
    """
    if not hasattr(self, "_gdal_dataset"):
        # nothing cached yet: resolve the url and open the file once
        self._gdal_dataset = gdal.Open(utils.safe_abspath(self.url))
    return self._gdal_dataset
def process(data, process_kwargs):
    """Write the features in ``data`` to a file and report what was saved.

    Args:
      data (dict): response with a ``"features"`` dataframe and a
        ``"projection"``. If ``"features"`` is missing or empty, ``data``
        is returned untouched and nothing is written.
      process_kwargs (dict): must provide ``"url"`` (target directory),
        ``"fields"`` (mapping of output field name to source column name),
        ``"extension"`` (a key of ``GeometryFileSink.supported_extensions``)
        and ``"hash"`` (used as the file name).

    Returns:
      dict: ``"features"`` is a GeoDataFrame that carries the index of the
      written features and a ``"saved"`` column set to True;
      ``"projection"`` is passed through from ``data``.
    """
    has_features = "features" in data and len(data["features"]) > 0
    if not has_features:
        return data  # do nothing for non-feature or empty requests

    frame = data["features"].copy()
    projection = data["projection"]
    directory = utils.safe_abspath(process_kwargs["url"])
    fields = process_kwargs["fields"]
    extension = process_kwargs["extension"]
    driver = GeometryFileSink.supported_extensions[extension]

    # make sure the output directory is there
    os.makedirs(directory, exist_ok=True)

    # the file is named after the hash supplied with the request
    target = os.path.join(
        directory, ".".join((process_kwargs["hash"], extension))
    )

    # expose the index as a regular column when a field refers to it
    source_columns = list(fields.values())
    index_name = frame.index.name
    if index_name in source_columns and index_name not in frame.columns:
        frame[index_name] = frame.index

    # keep only the requested columns and give them their output names
    frame = frame[["geometry"] + source_columns]
    frame.columns = ["geometry"] + list(fields)

    frame.to_file(target, driver=driver)

    saved = geopandas.GeoDataFrame(index=frame.index)
    saved["saved"] = True
    return {"features": saved, "projection": projection}
def process(data, process_kwargs):
    """Write the features in ``data`` to a file and report what was saved.

    Before writing, object-typed columns are passed through ``_to_json``,
    categorical columns are cast to the dtype of their categories, and for
    the GeoJSON driver the features are transformed to EPSG:4326.

    Args:
      data (dict): response with a ``"features"`` dataframe and a
        ``"projection"`` string. If ``"features"`` is missing or empty,
        ``data`` is returned untouched and nothing is written.
      process_kwargs (dict): must provide ``"url"`` (target directory),
        ``"fields"`` (mapping of output field name to source column name),
        ``"extension"`` (a key of ``GeometryFileSink.supported_extensions``)
        and ``"hash"`` (used as the file name).

    Returns:
      dict: ``"features"`` is a GeoDataFrame that carries the index of the
      written features and a ``"saved"`` column set to True;
      ``"projection"`` is passed through from ``data``.
    """
    has_features = "features" in data and len(data["features"]) > 0
    if not has_features:
        return data  # do nothing for non-feature or empty requests

    frame = data["features"].copy()
    projection = data["projection"]
    directory = utils.safe_abspath(process_kwargs["url"])
    fields = process_kwargs["fields"]
    extension = process_kwargs["extension"]
    driver = GeometryFileSink.supported_extensions[extension]

    # make sure the output directory is there
    os.makedirs(directory, exist_ok=True)

    # the file is named after the hash supplied with the request
    target = os.path.join(
        directory, ".".join((process_kwargs["hash"], extension))
    )

    # expose the index as a regular column when a field refers to it
    source_columns = list(fields.values())
    index_name = frame.index.name
    if index_name in source_columns and index_name not in frame.columns:
        frame[index_name] = frame.index

    # keep only the requested columns and give them their output names
    frame = frame[["geometry"] + source_columns]
    frame.columns = ["geometry"] + list(fields)

    # first pass: serialize nested fields (lists or dicts)
    for name in fields:
        column = frame[name]
        needs_json = column.dtype == object
        if not needs_json and str(column.dtype) == "category":
            needs_json = column.cat.categories.dtype == object
        if needs_json:
            frame[name] = column.map(_to_json)

    # second pass: replace categoricals by plain columns
    for name in fields:
        column = frame[name]
        if str(column.dtype) == "category":
            frame[name] = column.astype(column.cat.categories.dtype)

    # GeoJSON needs reprojection to EPSG:4326
    if driver == "GeoJSON" and projection.upper() != "EPSG:4326":
        frame = utils.geodataframe_transform(frame, projection, "EPSG:4326")

    frame.to_file(target, driver=driver)

    saved = geopandas.GeoDataFrame(index=frame.index)
    saved["saved"] = True
    return {"features": saved, "projection": projection}
def process(process_kwargs):
    """Answer a time, meta or values request from a GDAL raster file.

    ``process_kwargs["mode"]`` selects the behaviour:

    - ``"empty_vals"``: returns None
    - ``"empty_time"`` / ``"empty_meta"``: returns ``{"time": []}`` /
      ``{"meta": []}``
    - ``"time"``: returns ``{"time": [...]}`` with ``length`` values that
      start at ``start`` and are spaced by ``delta``
    - ``"meta"``: opens ``url`` and returns ``{"meta": [...]}`` holding the
      metadata dict of every band from ``first_band`` to ``last_band``
      (0-based, inclusive)
    - anything else is handled as a values request: the bands are read for
      ``bbox``, padded with ``fillvalue`` where needed, zoomed to
      ``height`` x ``width`` and returned as
      ``{"values": array, "no_data_value": fillvalue}``

    NOTE(review): when ``width`` or ``height`` is 0 a bare array is returned
    instead of the dict used by the other values paths - confirm that
    callers expect this.
    """
    mode = process_kwargs["mode"]

    # requests that are known to be empty need no file access
    if mode == "empty_vals":
        return None
    if mode == "empty_time":
        return {"time": []}
    if mode == "empty_meta":
        return {"meta": []}

    # time requests are computed from the request parameters alone
    if mode == "time":
        start = process_kwargs["start"]
        length = process_kwargs["length"]
        delta = process_kwargs["delta"]
        return {"time": [start + step * delta for step in range(length)]}

    # everything below needs the dataset
    dataset = gdal.Open(utils.safe_abspath(process_kwargs["url"]))
    first_band = process_kwargs["first_band"]
    last_band = process_kwargs["last_band"]

    # GDAL numbers its bands from 1
    band_numbers = range(first_band + 1, last_band + 2)

    if mode == "meta":
        return {
            "meta": [
                dataset.GetRasterBand(number).GetMetadata_Dict()
                for number in band_numbers
            ]
        }

    # 'vals' requests
    dtype = process_kwargs["dtype"]
    no_data_value = process_kwargs["fillvalue"]
    bbox = process_kwargs["bbox"]
    width = process_kwargs["width"]
    height = process_kwargs["height"]
    band_count = last_band - first_band + 1

    # return an empty array if 0-sized data was requested
    if 0 in (width, height):
        return np.empty((band_count, height, width), dtype=dtype)

    # translate the requested bounding box into index ranges of the raster
    raster_shape = (
        dataset.RasterCount,
        dataset.RasterYSize,
        dataset.RasterXSize,
    )
    gt = utils.GeoTransform(dataset.GetGeoTransform())
    ranges, padding = gt.get_array_ranges(bbox, raster_shape)
    read_shape = [bounds[1] - bounds[0] for bounds in ranges]

    # nothing to read: answer with nodata only
    if any(size <= 0 for size in read_shape):
        nodata = np.full(
            shape=(band_count, height, width),
            fill_value=no_data_value,
            dtype=dtype,
        )
        return {"values": nodata, "no_data_value": no_data_value}

    # ReadAsArray takes (x offset, y offset, x size, y size)
    x_offset = int(ranges[1][0])
    y_offset = int(ranges[0][0])
    x_size = int(read_shape[1])
    y_size = int(read_shape[0])

    values = np.empty([band_count] + read_shape, dtype=dtype)
    for index, number in enumerate(band_numbers):
        values[index] = dataset.GetRasterBand(number).ReadAsArray(
            x_offset, y_offset, x_size, y_size
        )

    # pad the data to the shape given by the index
    if padding is not None:
        # the leading (time) axis is never padded
        values = np.pad(
            values,
            ((0, 0),) + padding,
            "constant",
            constant_values=no_data_value,
        )

    # zoom to the desired height and width
    values = utils.zoom_raster(values, no_data_value, height, width)

    # fill nan values if they popped up
    values[~np.isfinite(values)] = no_data_value
    return {"values": values, "no_data_value": no_data_value}
def process(url, request):
    """Read geometries from a file and filter them according to ``request``.

    Args:
      url (str): location of the file; it is resolved with
        ``utils.safe_abspath`` before reading
      request (dict): keys read here are ``"projection"``, ``"geometry"``,
        ``"layer"``, ``"id_field"``, ``"filters"`` and ``"mode"``, plus the
        optional ``"min_size"`` and ``"limit"``

    Returns:
      dict: for mode ``"extent"``, ``{"projection": ..., "extent": ...}``
      where the extent is the total bounds of the remaining features, or
      None if the file yielded no features. For every other mode,
      ``{"projection": ..., "features": GeoDataFrame}``.

    Raises:
      RuntimeError: if no ``"limit"`` was given and the number of features
        exceeds the ``geomodeling.geometry-limit`` setting.
    """
    path = utils.safe_abspath(url)
    projection = request["projection"]
    wants_extent = request.get("mode") == "extent"

    # convert the requested projection to a geopandas CRS
    crs = utils.get_crs(projection)

    # wrap the requested shapely geometry in a GeoSeries
    filt_geom = gpd.GeoSeries([request["geometry"]], crs=crs)

    # acquire the data, filtering on the filt_geom bbox
    f = gpd.GeoDataFrame.from_file(
        path, bbox=filt_geom, layer=request["layer"]
    )

    # return directly if there is no data
    if len(f) == 0:
        if wants_extent:
            return {"projection": projection, "extent": None}
        # this takes modes 'centroid' and 'intersects'
        return {"projection": projection, "features": gpd.GeoDataFrame([])}

    f.set_index(request["id_field"], inplace=True)

    # apply the non-geometry field filters first; unknown fields are skipped
    field_masks = [
        f[field] == value
        for field, value in request["filters"].items()
        if field in f.columns
    ]
    if field_masks:
        combined = field_masks[0]
        for other in field_masks[1:]:
            combined = combined & other
        f = f[combined]

    # convert the data to the requested crs
    # NOTE(review): the return value is ignored, so this relies on the
    # helper transforming ``f`` in place - confirm against utils.
    utils.geodataframe_transform(f, utils.crs_to_srs(f.crs), projection)

    # drop geometries whose bounding box is not larger than min_size in
    # either direction
    min_size = request.get("min_size")
    if min_size:
        bounds = f["geometry"].bounds
        too_wide = (bounds["maxx"] - bounds["minx"]) > min_size
        too_high = (bounds["maxy"] - bounds["miny"]) > min_size
        f = f[too_wide | too_high]

    # only return geometries that truly intersect the requested geometry
    requested_geometry = filt_geom.iloc[0]
    # NOTE(review): "mode" is read with [] here but with .get() elsewhere,
    # so a request without "mode" raises KeyError at this point.
    if request["mode"] == "centroid":
        with warnings.catch_warnings():
            # geopandas warns if in WGS84
            warnings.simplefilter("ignore")
            f = f[f["geometry"].centroid.within(requested_geometry)]
    else:
        f = f[f["geometry"].intersects(requested_geometry)]

    if wants_extent:
        return {"projection": projection, "extent": tuple(f.total_bounds)}

    # this takes modes 'centroid' and 'intersects'
    limit = request.get("limit")
    if limit and len(f) > limit:
        # truncate the number of geometries if necessary
        f = f.iloc[:limit]
    elif limit is None:
        global_limit = config.get("geomodeling.geometry-limit")
        if len(f) > global_limit:
            raise RuntimeError(
                "The amount of returned geometries exceeded "
                "the maximum of {} geometries.".format(global_limit)
            )

    return {"projection": projection, "features": f}
def path(self):
    """Return ``self.url`` resolved through ``utils.safe_abspath``."""
    resolved = utils.safe_abspath(self.url)
    return resolved
def to_file(source, url, fields=None, tile_size=None, dry_run=False, **request):
    """Export data from a GeometryBlock to a file on disk.

    The data is first written to a temporary directory (as one file, or as
    one file per tile when ``tile_size`` is given) and then merged into the
    target file.

    Args:
      source (GeometryBlock): the block the data is coming from
      url (str): The target file path. The extension determines the format.
        For supported formats, consult
        GeometryFileSink.supported_extensions.
      fields (dict): a mapping that relates column names to output file
        field names, ``{<output file field name>: <column name>, ...}``.
      tile_size (int): Optionally use this for large exports to stay within
        memory constraints. The export is split in tiles of given size
        (units are determined by the projection). Finally the tiles are
        merged.
      dry_run (bool): Do nothing, only validate the arguments.
      geometry (shapely Geometry): Limit exported objects to objects whose
        centroid intersects with this geometry.
      projection (str): The projection as a WKT string or EPSG code.
        Sets the projection of the geometry argument, the target
        projection of the data, and the tiling projection.
      mode (str): one of ``{"intersects", "centroid"}``, default "centroid"
      start (datetime): start date as UTC datetime
      stop (datetime): stop date as UTC datetime
      **request: see GeometryBlock request specification

    Relevant settings can be adapted as follows:

      >>> from dask import config
      >>> config.set({"geomodeling.root": '/my/output/data/path'})
      >>> config.set({"temporary_directory": '/my/alternative/tmp/dir'})
    """
    request.setdefault("mode", "centroid")

    path = utils.safe_abspath(url)
    _, extension = os.path.splitext(path)

    if dry_run:
        tmpdir_factory = DryRunTempDir
    else:
        tmpdir_factory = tempfile.TemporaryDirectory

    tmp_root = config.get("temporary_directory", None)
    with tmpdir_factory(dir=tmp_root) as tmpdir:
        sink = GeometryFileSink(
            source, tmpdir, extension=extension, fields=fields
        )

        # wrap the sink in a GeometryTiler
        if tile_size is not None:
            sink = GeometryTiler(sink, tile_size, request["projection"])

        # a dry run stops once the blocks have been constructed
        if dry_run:
            return

        # export the dataset to the tmpdir (full dataset or multiple tiles)
        sink.get_data(**request)

        # copy the file / gather the tiles to the target location
        GeometryFileSink.merge_files(tmpdir, path)