Example #1
def test_parquet_roundtrip(tmp_path):
    # basic roundtrip
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    basedir = tmp_path / "dataset"
    ddf.to_parquet(basedir)

    # each of the 4 partitions is written as a separate parquet file
    paths = list(basedir.glob("*.parquet"))
    assert len(paths) == 4

    # reading back gives identical GeoDataFrame
    result = dask_geopandas.read_parquet(basedir)
    assert result.npartitions == 4
    assert_geodataframe_equal(result.compute(), df)

    # the written dataset is also readable by plain geopandas
    result_gpd = geopandas.read_parquet(basedir)
    # the dataset written by dask has "__null_dask_index__" index column name
    result_gpd.index.name = None
    assert_geodataframe_equal(result_gpd, df)

    result_part0 = geopandas.read_parquet(basedir / "part.0.parquet")
    result_part0.index.name = None
    assert_geodataframe_equal(result_part0, df.iloc[:45])
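A minimal sketch of the same roundtrip outside the test harness; the output directory name is illustrative:

import geopandas
import dask_geopandas

# Partition a GeoDataFrame; each partition becomes one part.N.parquet file.
gdf = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
ddf = dask_geopandas.from_geopandas(gdf, npartitions=4)
ddf.to_parquet("naturalearth.parquet")

# Reading the directory back restores the partitioned, lazy GeoDataFrame.
roundtrip = dask_geopandas.read_parquet("naturalearth.parquet")
assert roundtrip.npartitions == 4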
Example #2
def test_parquet_promote_secondary_geometry(tmpdir):
    """Reading a subset of columns that does not include the primary geometry
    column should promote the first geometry column present.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))
    df["geom2"] = df.geometry.copy()

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename, columns=["name", "geom2"])

    assert_geodataframe_equal(
        df.set_geometry("geom2")[["name", "geom2"]], pq_df)

    df["geom3"] = df.geometry.copy()

    df.to_parquet(filename)
    with pytest.warns(
        UserWarning,
        match="Multiple non-primary geometry columns read from Parquet file.",
    ):
        pq_df = read_parquet(filename, columns=["name", "geom2", "geom3"])

    assert_geodataframe_equal(
        df.set_geometry("geom2")[["name", "geom2", "geom3"]], pq_df)
Example #3
    def tracts_2010(
        self, states=None,
    ):
        """Nationwide Census Tracts as drawn in 2010 (cartographic 500k).

        Parameters
        ----------
        states : list-like
            list of state fips to subset the national dataframe

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 tracts as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        try:
            t = gpd.read_parquet(pathlib.Path(data_dir, "tracts_2010_500k.parquet"))
        except Exception:
            warn(
                "streaming remote data. Use `geosnap.io.store_census() to store the data locally for better performance"
            )
            t = gpd.read_parquet(
                "s3://spatial-ucr/census/tracts_cartographic/tracts_2010_500k.parquet"
            )

        if states:
            t = t[t.geoid.str[:2].isin(states)]
        t["year"] = 2010
        return t
Example #4
    def acs(self, year=2018, level="tract", states=None):
        """American Community Survey Data.

        Parameters
        ----------
        year : int
            vintage of the ACS release.
        level : str
            geographic level
        states : list, optional
            subset of states (as 2-digit fips) to return

        Returns
        -------
        geopandas.GeoDataFrame
            geodataframe of ACS data indexed by FIPS code
        """
        try:
            t = gpd.read_parquet(
                pathlib.Path(data_dir, "acs", f"acs_{year}_{level}.parquet")
            )
        except Exception:
            warn(
                "streaming remote data. Use `geosnap.io.store_acs() to store the data locally for better performance"
            )
            t = gpd.read_parquet(
                f"s3://spatial-ucr/census/acs/acs_{year}_{level}.parquet"
            )
        t = t.reset_index().rename(columns={"GEOID": "geoid"})

        if states:
            t = t[t.geoid.str[:2].isin(states)]
        t["year"] = year
        return t
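A hedged usage sketch, assuming `datasets` names an instance of the class these methods belong to; the state FIPS codes are illustrative:

# 2018 tract-level ACS for California ("06") and Oregon ("41")
acs_west = datasets.acs(year=2018, level="tract", states=["06", "41"])
print(acs_west[["geoid", "year"]].head())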
Example #5
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Has geo metadata with missing required fields will raise a ValueError.

    This requires writing the parquet file directly below, so that we can
    control the metadata that is written for this test.
    """

    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    # convert to DataFrame and encode geometry to WKB
    df = DataFrame(df)
    df["geometry"] = to_wkb(df["geometry"].values)

    table = Table.from_pandas(df)
    metadata = table.schema.metadata
    metadata.update(geo_meta)
    table = table.replace_schema_metadata(metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)
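The `geo_meta` and `error` arguments arrive via a `pytest.mark.parametrize` decorator that is not shown here. A plausible sketch of that parametrization; the metadata payloads and expected messages are illustrative, not the library's exact cases:

import pytest

@pytest.mark.parametrize(
    "geo_meta,error",
    [
        # geo key present but empty
        ({b"geo": b""}, "Missing or malformed geo metadata in Parquet/Feather file"),
        # geo key present but not valid JSON
        ({b"geo": b"not-json"}, "Missing or malformed geo metadata in Parquet/Feather file"),
    ],
)
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    ...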
Example #6
def test_fsspec_url():
    fsspec = pytest.importorskip("fsspec")
    import fsspec.implementations.memory

    class MyMemoryFileSystem(fsspec.implementations.memory.MemoryFileSystem):
        # Simple fsspec filesystem that adds a required keyword.
        # Attempting to use this filesystem without the keyword will raise an exception.
        def __init__(self, is_set, *args, **kwargs):
            self.is_set = is_set
            super().__init__(*args, **kwargs)

    fsspec.register_implementation("memory", MyMemoryFileSystem, clobber=True)
    memfs = MyMemoryFileSystem(is_set=True)

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    with memfs.open("data.parquet", "wb") as f:
        df.to_parquet(f)

    result = read_parquet("memory://data.parquet", storage_options=dict(is_set=True))
    assert_geodataframe_equal(result, df)

    result = read_parquet("memory://data.parquet", filesystem=memfs)
    assert_geodataframe_equal(result, df)
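The `storage_options` dict is forwarded to the fsspec filesystem constructor, which is why passing `is_set=True` satisfies the required keyword that `MyMemoryFileSystem` adds; omitting it would make the constructor raise. Passing an already-constructed filesystem via `filesystem=` sidesteps that machinery entirely.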
Example #7
def _fetcher(local_path, remote_path, warning_msg):
    try:
        t = gpd.read_parquet(local_path)
    except FileNotFoundError:
        warn(warning_msg)
        t = gpd.read_parquet(remote_path, storage_options={"anon": True})

    return t
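Unlike the broad `except Exception` in the class methods above, `_fetcher` only falls back to the remote path on `FileNotFoundError`, so unrelated read errors still surface. A hedged usage sketch with illustrative paths:

import pathlib

states = _fetcher(
    local_path=pathlib.Path(data_dir, "states.parquet"),  # data_dir as in the examples above
    remote_path="s3://spatial-ucr/census/administrative/states.parquet",
    warning_msg="Streaming remote data; store it locally for better performance.",
)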
Example #8
def test_parquet_columns_no_geometry(tmpdir):
    """Reading a parquet file that is missing all of the geometry columns
    should raise a ValueError"""

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    with pytest.raises(ValueError):
        read_parquet(filename, columns=["name"])
Example #9
    def _open_dataset(self):
        """
        Open dataset using geopandas.
        """
        if self._use_fsspec:
            with fsspec.open_files(self.urlpath, **self.storage_options) as f:
                f = self._resolve_single_file(f) if len(f) > 1 else f[0]
                self._dataframe = geopandas.read_parquet(
                    f,
                    **self._geopandas_kwargs,
                )
        else:
            self._dataframe = geopandas.read_parquet(
                self.urlpath, **self._geopandas_kwargs
            )
Example #10
def test_parquet_partition_on(tmp_path, write_metadata_file):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    # Writing a partitioned dataset based on one of the attribute columns
    basedir = tmp_path / "naturalearth_lowres_by_continent.parquet"
    ddf.to_parquet(basedir,
                   partition_on="continent",
                   write_metadata_file=write_metadata_file)

    # Check for one of the partitions that the file is present and is correct
    # 8 continent directories, plus _metadata and _common_metadata when written
    n_files = 10 if write_metadata_file else 8
    assert len(list(basedir.iterdir())) == n_files
    assert (basedir / "continent=Africa").exists()
    result_africa = geopandas.read_parquet(basedir / "continent=Africa")
    expected = df[df["continent"] == "Africa"].drop(columns=["continent"])
    result_africa.index.name = None
    assert_geodataframe_equal(result_africa, expected)

    # Check roundtrip
    result = dask_geopandas.read_parquet(basedir)
    assert result.npartitions >= 8
    assert result.spatial_partitions is not None
    expected = df.copy()
    expected["continent"] = expected["continent"].astype("category")
    assert_geodataframe_equal(result.compute(), expected, check_like=True)
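Since `dask_geopandas.read_parquet` forwards keyword arguments to `dask.dataframe.read_parquet`, a single partition can plausibly be loaded with pyarrow-style filters instead of pointing at the `continent=Africa` directory; a sketch under that assumption:

africa = dask_geopandas.read_parquet(
    basedir, filters=[("continent", "==", "Africa")]
).compute()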
Example #11
    def states(self):
        """States.

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            US States as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        try:
            return gpd.read_parquet(pathlib.Path(data_dir, "states.parquet"))
        except Exception:
            return gpd.read_parquet(
                "s3://spatial-ucr/census/administrative/states.parquet"
            )
Example #12
def test_write_read_parquet_expand_user():
    gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
    test_file = "~/test_file.parquet"
    gdf.to_parquet(test_file)
    pq_df = geopandas.read_parquet(test_file)
    assert_geodataframe_equal(gdf, pq_df, check_crs=True)
    os.remove(os.path.expanduser(test_file))
Example #13
def download_geoparquet(file_name="my_file.parquet",
                        bucket_name="city-planning-entitlements",
                        local_path="",
                        S3_path=""):
    """
    Downloads geoparquet from S3 locally,
    read into memory as GeoDataFrame, and removes local version.

    geopandas>=0.8.0 supports initial geoparquets.

    Parameters
    ==========

    file_name: str, name of the file, such as "census_tracts.parquet"
    bucket_name: str, S3 bucket name.
    local_path: str, the local directory or folder path where the file should be stored.
                Ex: "./data/"
    S3_path: str, the S3 directory or folder path to where the file is stored in S3.
            Ex: "data/"
    """
    s3.download_file(bucket_name, f'{S3_path}{file_name}',
                     f'{local_path}{file_name}')
    gdf = gpd.read_parquet(f'{local_path}{file_name}')
    os.remove(f'{local_path}{file_name}')

    return gdf
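A usage sketch built from the docstring's own examples:

gdf = download_geoparquet(
    file_name="census_tracts.parquet",
    bucket_name="city-planning-entitlements",
    local_path="./data/",
    S3_path="data/",
)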
Example #14
def test_parquet_index(tmpdir):
    """Setting index=`True` should preserve index in output, and
    setting index=`False` should drop index from output.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset)).set_index("iso_a3")

    filename = os.path.join(str(tmpdir), "test_with_index.pq")
    df.to_parquet(filename, index=True)
    pq_df = read_parquet(filename)
    assert_geodataframe_equal(df, pq_df)

    filename = os.path.join(str(tmpdir), "drop_index.pq")
    df.to_parquet(filename, index=False)
    pq_df = read_parquet(filename)
    assert_geodataframe_equal(df.reset_index(drop=True), pq_df)
Example #15
def test_parquet_missing_metadata2(tmpdir):
    """Missing geo metadata, such as from a parquet file created
    from a pyarrow Table (which will also not contain pandas metadata),
    will raise a ValueError.
    """
    import pyarrow.parquet as pq

    table = pyarrow.table({"a": [1, 2, 3]})
    filename = os.path.join(str(tmpdir), "test.pq")

    # use pyarrow.parquet write_table (no geo metadata, but also no pandas metadata)
    pq.write_table(table, filename)

    # missing metadata will raise ValueError
    with pytest.raises(ValueError,
                       match="Missing geo metadata in Parquet/Feather file."):
        read_parquet(filename)
Example #16
def region(vector, raster, cmap='rainbow', boundary='red', band=1):
    """Quickly plot a subregion from a rasterdataset.

    The subregion is defined by the bounding box of the supplied vector.
    Colourmaps will primarily be obtained from the colorcet library.

    Parameters
    ----------
    vector : geopandas.GeoDataFrame or path_like object
    raster : rasterio.io.DatasetReader or path_like object
    cmap : colormap
        colormap for the raster data.
        Continuous data: e.g. colorwheel, rainbow, fire
        Categorical data: e.g. glasbey
    boundary : colorname
        color for the vector data
    band : int
        raster band to read (default 1)

    Returns
    -------
    ax : matplotlib plot
    """
    if isinstance(vector, str):
        if Path(vector).suffix == '.parquet':
            gdf = gpd.read_parquet(vector)
        else:
            gdf = gpd.read_file(vector)
    else:
        gdf = vector

    if isinstance(raster, str):
        rast_file = rasterio.open(raster)
    else:
        rast_file = raster

    if cmap in cc.cm:
        cmap = cc.cm[cmap]

    # matplotlib and geographic packages like rasterio and geopandas use
    # different ordering conventions for their bounding box information.
    # geographic information systems (bounds): (west, south, east, north)
    # matplotlib (extent): (west, east, south, north)
    gdf_bounds = gdf.total_bounds
    gdf_extent = gdf_bounds[[0, 2, 1, 3]]

    # Subsetting raster data in rasterio is easiest to do before it is
    # read into memory (although it is possible to do so after read()).
    # Subsetting data requires a rasterio.windows.Window object to be built
    # that describes the area to focus on. There are many helper functions
    # to build windows, but the simplest is: rast_file.window
    # A window can easily be built by unpacking bounds obtained from a gdf
    rast_window = rast_file.window(*gdf_bounds)

    # Now we can read in our data within the desired region
    rast = rast_file.read(band, window=rast_window)

    plt.imshow(rast, cmap=cmap, extent=gdf_extent)
    ax = plt.gca()
    gdf.boundary.plot(ax=ax, color=boundary)
    return ax
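A hedged usage sketch; the file names are illustrative:

ax = region("catchments.parquet", "elevation.tif", cmap="rainbow", boundary="red", band=1)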
Example #17
def test_parquet_subset_columns(tmpdir):
    """Reading a subset of columns should correctly decode selected geometry
    columns.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename, columns=["name", "geometry"])

    assert_geodataframe_equal(df[["name", "geometry"]], pq_df)

    with pytest.raises(
            ValueError,
            match="No geometry columns are included in the columns read"):
        read_parquet(filename, columns=[])
Example #18
    def blocks_2010(self, states=None, fips=None):
        """Census blocks for 2010.

        Parameters
        ----------
        states : list-like
            list of state fips codes to return as a dataframe.
        fips : str, optional
            FIPS code prefix (e.g. a county FIPS) used to subset blocks
            within the selected states.

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 blocks as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        if isinstance(states, (str, int)):
            states = [states]
        blks = {}
        for state in states:
            try:
                blks[state] = gpd.read_parquet(
                    pathlib.Path(data_dir, "blocks_2010", f"{state}.parquet")
                )
            except Exception:
                warn(
                    "Unable to locate local census 2010 block data. Streaming instead.\n"
                    "If you plan to use census data repeatedly you can store it locally "
                    "with the io.store_blocks_2010 function for better performance"
                )
                blks[state] = gpd.read_parquet(
                    f"s3://spatial-ucr/census/blocks_2010/{state}.parquet"
                )

            if fips:
                blks[state] = blks[state][blks[state]["geoid"].str.startswith(fips)]

            blks[state]["year"] = 2010
        blocks = list(blks.values())
        blocks = gpd.GeoDataFrame(pd.concat(blocks, sort=True))

        return blocks
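A hedged usage sketch, again assuming `datasets` names an instance; the state and county FIPS codes are illustrative:

# 2010 blocks for California ("06"), limited to Los Angeles County ("06037").
# A bare string works because the method wraps it in a list.
la_blocks = datasets.blocks_2010(states="06", fips="06037")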
Example #19
    def counties(self):
        """Nationwide counties as drawn in 2010.

        Returns
        -------
        geopandas.GeoDataFrame
            2010 counties as a geodataframe.

        """
        try:
            return gpd.read_parquet(pathlib.Path(data_dir, "counties.parquet"))
        except Exception:
            return gpd.read_parquet(
                "s3://spatial-ucr/census/administrative/counties.parquet"
            )
Example #20
def main(config):
    incoming_data_path = config['paths']['incoming_data']
    processed_data_path = config['paths']['data']
    output_data_path = config['paths']['output']
    epsg_jamaica = 3448

    baseline_year = 2019
    projection_end_year = 2100
    discounting_rate = 10

    asset_data_details = pd.read_csv(
        os.path.join(processed_data_path, "networks",
                     "network_layers_hazard_intersections_details.csv"))
    asset_data_details = asset_data_details[
        asset_data_details["sector"] != "buildings"]
    hazard_asset_intersection_path = os.path.join(output_data_path,
                                                  "hazard_asset_intersection")

    flood_hazards = ["coastal", "fluvial", "surface"]
    flood_threshold = 0.5
    hazard_data_details = pd.read_csv(os.path.join(processed_data_path,
                                                   "hazards",
                                                   "hazard_layers.csv"),
                                      encoding="latin1")
    hazard_keys = hazard_data_details[hazard_data_details["hazard"].isin(
        flood_hazards)].key.values.tolist()

    for asset_info in asset_data_details.itertuples():
        asset_id = asset_info.asset_id_column
        index_columns = [asset_id, "damage_cost_unit", "hazard"]

        hazard_intersection_file = os.path.join(
            hazard_asset_intersection_path,
            f"{asset_info.asset_gpkg}_splits__hazard_layers__{asset_info.asset_layer}.geoparquet"
        )
        if os.path.isfile(hazard_intersection_file):
            hazard_df = gpd.read_parquet(hazard_intersection_file)
            hazard_df = hazard_df.to_crs(epsg=epsg_jamaica)
            hazard_df = add_exposure_dimensions(
                hazard_df,
                dataframe_type=asset_info.asset_layer,
                epsg=epsg_jamaica)
            hazard_df = hazard_df[[asset_id, 'exposure', 'exposure_unit'] +
                                  hazard_keys]
            hazard_df["max_flood_depth"] = hazard_df[hazard_keys].max(axis=1)
            hazard_df = hazard_df[
                hazard_df["max_flood_depth"] > flood_threshold]

            print(
                hazard_df.sort_values(by="max_flood_depth",
                                      ascending=False).head(20))
            print(f"* Done with {asset_info.asset_gpkg}")
Example #21
    def msas(self):
        """Metropolitan Statistical Areas as drawn in 2020.

        Data come from the U.S. Census Bureau's most recent TIGER/LINE files
        https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2020&layergroup=Core+Based+Statistical+Areas


        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2020 MSAs as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        try:
            return gpd.read_parquet(pathlib.Path(data_dir, "msas.parquet")).sort_values(
                by="name"
            )
        except Exception:
            return gpd.read_parquet(
                "s3://spatial-ucr/census/administrative/msas.parquet"
            ).sort_values(by="name")
Example #22
def test_parquet_repeat_columns(tmpdir):
    """Reading repeated columns should return first value of each repeated column
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    columns = ["name", "name", "iso_a3", "name", "geometry"]
    pq_df = read_parquet(filename, columns=columns)

    assert pq_df.columns.tolist() == ["name", "iso_a3", "geometry"]
Example #23
def test_parquet_compression(compression, tmpdir):
    """Using compression options should not raise errors, and should
    return identical GeoDataFrame.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename, compression=compression)
    pq_df = read_parquet(filename)

    assert isinstance(pq_df, GeoDataFrame)
    assert_geodataframe_equal(df, pq_df)
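`compression` is supplied by a parametrize decorator that is not shown. A plausible sketch; the exact option list is an assumption:

import pytest

@pytest.mark.parametrize("compression", ["snappy", "gzip", "brotli", None])
def test_parquet_compression(compression, tmpdir):
    ...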
Example #24
def transform_sa_geometries(input_filepath: Path,
                            output_filepath: Path) -> None:
    """Transform Small Area geometries.

    Args:
        input_filepath (Path): Path to Raw Small Area Geometries Data
        output_filepath (Path): Path to Clean Small Area Geometries Data
    """
    sa_geometries = (
        gpd.read_parquet(input_filepath).pipe(extract_dublin_local_authorities)
        .to_crs("epsg:4326").loc[:, ["SMALL_AREA", "geometry"]].rename(
            columns={"SMALL_AREA": "small_area"}))

    sa_geometries.to_parquet(output_filepath)
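A hedged usage sketch with illustrative paths:

from pathlib import Path

transform_sa_geometries(
    input_filepath=Path("data/raw/small_area_geometries.parquet"),
    output_filepath=Path("data/clean/small_area_geometries.parquet"),
)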
Example #25
def test_parquet_missing_metadata(tmpdir):
    """Missing geo metadata, such as from a parquet file created
    from a pandas DataFrame, will raise a ValueError.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    # convert to DataFrame
    df = DataFrame(df)

    # convert the geometry column so we can extract later
    df["geometry"] = to_wkb(df["geometry"].values)

    filename = os.path.join(str(tmpdir), "test.pq")

    # use pandas to_parquet (no geo metadata)
    df.to_parquet(filename)

    # missing metadata will raise ValueError
    with pytest.raises(ValueError,
                       match="Missing geo metadata in Parquet/Feather file."):
        read_parquet(filename)
Example #26
    def map(self, *args, **kwargs):
        """
        Fetches map of `self.level` given parameters
        :param args: positional parameters for geobr map reading function
        :param kwargs: keyword parameters for geobr map reading function
        :return: GeoDataFrame
        """
        if os.path.exists(f'{self.level}_map.parquet'):
            self.mapdf = gpd.read_parquet(f'{self.level}_map.parquet')
            return self.mapdf
        if self.mapdf is None:
            print("Downloading the Map...")
            self.mapdf = LEVELS[self.level](*args, **kwargs)
            self._persist('map')
        return self.mapdf
Example #27
def test_parquet_missing_crs(tmpdir):
    """If CRS is `None`, it should be properly handled
    and remain `None` when read from parquet`.
    """

    test_dataset = "naturalearth_lowres"

    df = read_file(get_path(test_dataset))
    df.crs = None

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename)

    assert pq_df.crs is None

    assert_geodataframe_equal(df, pq_df, check_crs=True)
Example #28
    def generate_populations(self, scale=0.05):
        """
        Generate a synthetic population of size scale * population for each
        polygon in self.mapdf.
        :param scale: fraction of each polygon's population to sample
        """
        if os.path.exists(f"{self.level}_pop.parquet"):
            self.pop = gpd.read_parquet(f"{self.level}_pop.parquet")
            return
        if "population" not in self.mapdf.columns:
            self.demographics()
        people, sex, age = [], [], []
        # accumulate samples across polygons instead of overwriting them
        # on every iteration
        for row in self.mapdf.itertuples():
            sampled = sample_random_people(int(row.population * scale), row.geometry)
            people.extend(sampled)
            sex.extend(np.random.randint(0, 2, size=len(sampled)))
            age.extend(np.random.randint(0, 100, size=len(sampled)))
        print(len(people), people[0])
        self.pop = gpd.GeoDataFrame({"sex": sex, "age": age, "geometry": people})
        self.pop["longitude"] = [pt.x for pt in self.pop.geometry]
        self.pop["latitude"] = [pt.y for pt in self.pop.geometry]
        self._persist("pop")
Example #29
def test_parquet_multiple_geom_cols(tmpdir):
    """If multiple geometry columns are present when written to parquet,
    they should all be returned as such when read from parquet.
    """

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))
    df["geom2"] = df.geometry.copy()

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    assert os.path.exists(filename)

    pq_df = read_parquet(filename)

    assert isinstance(pq_df, GeoDataFrame)
    assert_geodataframe_equal(df, pq_df)

    assert_geoseries_equal(df.geom2, pq_df.geom2, check_geom_type=True)
Example #30
        def cache_function(*args, **kwargs):
            if not isinstance(cache_dir, Path):
                raise TypeError('cache_dir should be a pathlib.Path object')

            cache_file = cache_dir / (func.__name__ + '.trc.pqt')

            if hard_reset or (not cache_file.exists()):
                result = func(*args, **kwargs)
                if not isinstance(result, pd.DataFrame):
                    raise TypeError(
                        f"The result of computing {func.__name__} is not a DataFrame"
                    )
                result.to_parquet(cache_file)
                return result
            print("{} exist".format(cache_file.name))
            if geoformat:
                import geopandas as gpd
                result = gpd.read_parquet(cache_file)
            else:
                result = pd.read_parquet(cache_file)
            return result
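`cache_function` closes over `cache_dir`, `func`, `hard_reset`, and `geoformat`, so it is the innermost layer of a decorator factory that is not shown. A plausible sketch of that enclosing structure; the factory name `cached` is an assumption:

from functools import wraps
from pathlib import Path

def cached(cache_dir, hard_reset=False, geoformat=False):  # hypothetical factory name
    """Cache a DataFrame-producing function as parquet under cache_dir."""
    def decorator(func):
        @wraps(func)
        def cache_function(*args, **kwargs):
            ...  # body as in Example #30
        return cache_function
    return decorator

@cached(cache_dir=Path("./cache"), geoformat=True)
def load_tracts():
    import geopandas as gpd
    return gpd.read_parquet("tracts.parquet")  # illustrative source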