Example #1
    @classmethod
    def init(cls, eltype, data, kdims, vdims):
        import pandas as pd
        from spatialpandas import GeoDataFrame, GeoSeries

        if kdims is None:
            kdims = eltype.kdims

        if vdims is None:
            vdims = eltype.vdims

        if isinstance(data, GeoSeries):
            data = data.to_frame()

        if 'geopandas' in sys.modules:
            import geopandas as gpd
            if isinstance(data, gpd.GeoSeries):
                data = data.to_frame()
            if isinstance(data, gpd.GeoDataFrame):
                data = GeoDataFrame(data)
        if isinstance(data, list):
            if 'shapely' in sys.modules:
                data = from_shapely(data)
            if isinstance(data, list):
                data = from_multi(eltype, data, kdims, vdims)
        elif not isinstance(data, GeoDataFrame):
            raise ValueError(
                "SpatialPandasInterface only support spatialpandas DataFrames."
            )
        elif 'geometry' not in data:
            cls.geo_column(data)

        index_names = data.index.names if isinstance(
            data, pd.DataFrame) else [data.index.name]
        if index_names == [None]:
            index_names = ['index']

        for kd in kdims + vdims:
            kd = dimension_name(kd)
            if kd in data.columns:
                continue
            if any(kd == ('index' if name is None else name)
                   for name in index_names):
                data = data.reset_index()
                break

        return data, {'kdims': kdims, 'vdims': vdims}, {}
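The method above normalizes several input types (spatialpandas GeoSeries, geopandas GeoSeries/GeoDataFrames, lists of shapely geometries) into a spatialpandas GeoDataFrame before resolving kdims and vdims. Below is a minimal sketch of the geopandas path only; the frame contents are hypothetical, and all it assumes is that spatialpandas.GeoDataFrame can wrap a geopandas.GeoDataFrame.

import geopandas as gpd
from shapely.geometry import Point
from spatialpandas import GeoDataFrame

# Hypothetical geopandas input; init() wraps it in a spatialpandas GeoDataFrame
gp_df = gpd.GeoDataFrame({'value': [1, 2]},
                         geometry=[Point(0, 0), Point(1, 1)])
sp_df = GeoDataFrame(gp_df)
assert sp_df.geometry.name == 'geometry'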
Example #2
def test_multipoint_cx_frame_selection(gp_multipoint, rect):
    x0, y0, x1, y1 = rect
    expected = GeoDataFrame(
        GeoSeries(gp_multipoint.cx[x0:x1, y0:y1], dtype='multipoint'))

    sp_multipoint = GeoSeries(gp_multipoint).to_frame()
    result = sp_multipoint.cx[x0:x1, y0:y1]
    assert_frame_equal(expected, result, obj='GeoDataFrame')
Example #3
def test_multipoint_cx_frame_selection_dask(gp_multipoint, rect):
    x0, y0, x1, y1 = rect
    expected = GeoDataFrame(
        GeoSeries(gp_multipoint.cx[x0:x1, y0:y1], dtype='multipoint')
    )

    sp_multipoint = dd.from_pandas(GeoSeries(gp_multipoint).to_frame(), npartitions=3)
    result = sp_multipoint.cx[x0:x1, y0:y1].compute()
    assert_frame_equal(expected, result, obj='GeoDataFrame')
Example #4
def read_parquet(path, columns=None):
    # Load using standard pandas read_parquet
    result = pd_read_parquet(path, engine="auto", columns=columns)

    # Import geometry columns, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(path)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        result = _import_geometry_columns(result, geom_cols)

    # Return result
    return GeoDataFrame(result)
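As a quick usage sketch mirroring the round-trip tests further down, a GeoDataFrame written with to_parquet can be read back with this function; the file name and data below are illustrative only.

from spatialpandas import GeoDataFrame
from spatialpandas.geometry import PointArray
from spatialpandas.io import read_parquet, to_parquet

df = GeoDataFrame({
    'a': [0, 1, 2],
    'point': PointArray([[0, 0], [1, 1], [2, 2]]),
})
to_parquet(df, 'df.parq')
df_read = read_parquet('df.parq', columns=['a', 'point'])
assert isinstance(df_read, GeoDataFrame)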
Example #5
def load_kinsa():
    print('Loading Kinsa Data')
    county_df = gpd.read_file(path.expanduser('~/kinsa_geometry.gpkg'),
                              layer='county')
    oneday_df = ddf.read_csv('s3://makepath-demo/kinsa/one_day.csv').compute()
    oneday_df['region_id'] = oneday_df['region_id'].astype(str).str.zfill(5)
    county_df['GEOID'] = county_df['GEOID'].astype(str).str.zfill(5)

    fields = ['GEOID', 'geometry', 'forecast_upper']
    county_df = county_df.join(oneday_df.set_index('region_id'),
                               on='GEOID')[fields]
    return GeoDataFrame(county_df, geometry='geometry')
Example #6
def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe
        n = min(len(gp_multipoint), len(gp_multiline))
        df = GeoDataFrame({
            'points': GeoSeries(gp_multipoint[:n]),
            'lines': GeoSeries(gp_multiline[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf = dd.from_pandas(df, npartitions=3)

        path = tmp_path / 'ddf.parq'
        if use_temp_format:
            (tmp_path / 'scratch').mkdir(parents=True, exist_ok=True)
            tempdir_format = str(tmp_path / 'scratch' /
                                 'part-{uuid}-{partition:03d}')
        else:
            tempdir_format = None

        _retry_args = dict(wait_exponential_multiplier=10,
                           wait_exponential_max=20000,
                           stop_max_attempt_number=4)

        ddf_packed = ddf.pack_partitions_to_parquet(
            str(path),
            npartitions=12,
            tempdir_format=tempdir_format,
            _retry_args=_retry_args,
        )

        # Check the number of partitions (< 12 can happen in the case of empty partitions)
        assert ddf_packed.npartitions <= 12

        # Check that rows are now sorted in order of hilbert distance
        total_bounds = df.lines.total_bounds
        hilbert_distances = ddf_packed.lines.map_partitions(
            lambda s: s.hilbert_distance(total_bounds=total_bounds)).compute(
            ).values

        # Compute expected hilbert distances
        expected_distances = np.sort(
            df.lines.hilbert_distance(total_bounds=total_bounds).values)

        np.testing.assert_equal(expected_distances, hilbert_distances)
        assert ddf_packed.geometry.name == 'points'

        # Read columns
        columns = ['a', 'lines']
        ddf_read_cols = read_parquet_dask(path, columns=columns)
        pd.testing.assert_frame_equal(ddf_read_cols.compute(),
                                      ddf_packed[columns].compute())
Example #7
    @classmethod
    def split(cls, dataset, start, end, datatype, **kwargs):
        from spatialpandas import GeoDataFrame, GeoSeries
        from ...element import Polygons

        objs = []
        if not len(dataset.data):
            return []
        xdim, ydim = cls.geom_dims(dataset)
        value_dims = [
            dim for dim in dataset.kdims + dataset.vdims
            if dim not in (xdim, ydim)
        ]
        row = dataset.data.iloc[0]
        col = cls.geo_column(dataset.data)
        geom_type = cls.geom_type(dataset)
        if datatype is not None:
            arr = geom_to_array(row[col], geom_type=geom_type)
            d = {(xdim.name, ydim.name): arr}
            d.update({dim.name: row[dim.name] for dim in value_dims})
            ds = dataset.clone(d, datatype=['dictionary'])

        holes = cls.holes(dataset) if cls.has_holes(dataset) else None
        for i, row in dataset.data.iterrows():
            if datatype is None:
                gdf = GeoDataFrame({
                    c: GeoSeries([row[c]]) if c == 'geometry' else [row[c]]
                    for c in dataset.data.columns
                })
                objs.append(dataset.clone(gdf))
                continue

            geom = row[col]
            gt = geom_type or get_geom_type(dataset.data, col)
            arr = geom_to_array(geom, geom_type=gt)
            d = {xdim.name: arr[:, 0], ydim.name: arr[:, 1]}
            d.update({dim.name: row[dim.name] for dim in value_dims})
            if datatype in ('dictionary', 'columns'):
                if holes is not None:
                    d[Polygons._hole_key] = holes[i]
                d['geom_type'] = gt
                objs.append(d)
                continue

            ds.data = d
            if datatype == 'array':
                obj = ds.array(**kwargs)
            elif datatype == 'dataframe':
                obj = ds.dframe(**kwargs)
            else:
                raise ValueError("%s datatype not support" % datatype)
            objs.append(obj)
        return objs
Example #8
def test_dataframe_slice_types():
    gdf = GeoDataFrame({
        'a': [3, 2, 1],
        'b': [10, 11, 12],
        'points':
        pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]], dtype='multipoint'),
        'line':
        pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]], dtype='line'),
    })

    assert isinstance(gdf['a'], pd.Series)
    assert isinstance(gdf['points'], GeoSeries)
    assert isinstance(gdf[['a', 'b']], pd.DataFrame)
    assert isinstance(gdf[['a', 'line']], GeoDataFrame)
Example #9
def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    df_read = read_parquet(path)
    assert isinstance(df_read, GeoDataFrame)
    pd.testing.assert_frame_equal(df, df_read)
Example #10
def test_parquet_columns(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    columns = ['a', 'multiline']
    df_read = read_parquet(str(path), columns=columns)
    assert isinstance(df_read, GeoDataFrame)
    pd.testing.assert_frame_equal(df[columns], df_read)
Example #11
from typing import List, Union

import numpy as np


def _to_spatialpandas(
    column: List[Union[int, float]],
    polygon_points: List[np.ndarray],
    column_name: str,
):
    from spatialpandas import GeoDataFrame
    from spatialpandas.geometry import PolygonArray

    # spatialpandas expects 1d numpy arrays.
    for i, arrays in enumerate(polygon_points):
        polygon_points[i] = \
            list(map(lambda array: np.reshape(array, -1), arrays))

    df = GeoDataFrame({
        column_name: column,
        "geometry": PolygonArray(polygon_points)
    })
    return df
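A hypothetical call for the helper above, assuming spatialpandas' convention that each polygon is a list of rings and each ring is a flat array of interleaved x/y coordinates; the coordinates and the 'value' column are made up for illustration.

import numpy as np

# One polygon with a single closed, counter-clockwise outer ring, passed as a
# 2D array that the helper reshapes into flat interleaved coordinates.
polygon_points = [
    [np.array([[0.0, 0.0], [2.0, 0.0], [2.0, 2.0], [0.0, 0.0]])],
]
df = _to_spatialpandas([1.5], polygon_points, column_name='value')
# df has a 'value' column and a polygon-typed 'geometry' column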
Example #12
def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    if use_temp_format:
        tempdir_format = str(tmp_path / 'scratch' /
                             'part-{uuid}-{partition:03d}')
    else:
        tempdir_format = None

    ddf_packed = ddf.pack_partitions_to_parquet(path,
                                                npartitions=12,
                                                tempdir_format=tempdir_format)

    # Check the number of partitions (< 12 can happen in the case of empty partitions)
    assert ddf_packed.npartitions <= 12

    # Check that rows are now sorted in order of hilbert distance
    total_bounds = df.lines.total_bounds
    hilbert_distances = ddf_packed.lines.map_partitions(
        lambda s: s.hilbert_distance(total_bounds=total_bounds)).compute(
        ).values

    # Compute expected hilbert distances
    expected_distances = np.sort(
        df.lines.hilbert_distance(total_bounds=total_bounds).values)

    np.testing.assert_equal(expected_distances, hilbert_distances)
    assert ddf_packed.geometry.name == 'points'

    # Read columns
    columns = ['a', 'lines']
    ddf_read_cols = read_parquet_dask(path,
                                      columns=columns + ['hilbert_distance'])
    pd.testing.assert_frame_equal(ddf_read_cols.compute(),
                                  ddf_packed[columns].compute())
Example #13
def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })

    df.index.name = 'range_idx'

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    df_read = read_parquet(str(path),
                           columns=['point', 'multipoint', 'multiline', 'a'])
    assert isinstance(df_read, GeoDataFrame)
    pd.testing.assert_frame_equal(df, df_read)
    assert df_read.index.name == df.index.name
Example #14
def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe
        n = min(len(gp_multipoint), len(gp_multiline))
        df = GeoDataFrame({
            'points': GeoSeries(gp_multipoint[:n]),
            'lines': GeoSeries(gp_multiline[:n]),
            'a': list(range(n))
        })
        ddf = dd.from_pandas(df, npartitions=3)

        path = tmp_path / 'ddf.parq'
        ddf.to_parquet(str(path))
        ddf_read = read_parquet_dask(str(path))

        # Check type
        assert isinstance(ddf_read, DaskGeoDataFrame)

        # Check that partition bounds were loaded
        nonempty = np.nonzero(
            np.asarray(ddf.map_partitions(len).compute() > 0))[0]
        assert set(ddf_read._partition_bounds) == {'points', 'lines'}
        expected_partition_bounds = (
            ddf['points'].partition_bounds.iloc[nonempty].reset_index(
                drop=True))
        expected_partition_bounds.index.name = 'partition'

        pd.testing.assert_frame_equal(
            expected_partition_bounds,
            ddf_read._partition_bounds['points'],
        )

        expected_partition_bounds = (
            ddf['lines'].partition_bounds.iloc[nonempty].reset_index(
                drop=True))
        expected_partition_bounds.index.name = 'partition'
        pd.testing.assert_frame_equal(
            expected_partition_bounds,
            ddf_read._partition_bounds['lines'],
        )

        assert ddf_read.geometry.name == 'points'
Example #15
def test_active_geometry(use_dask):
    gdf = GeoDataFrame(
        OrderedDict([
            ('a', [3, 2, 1]),
            ('points',
             pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]],
                      dtype='multipoint')),
            ('line',
             pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]], dtype='line')),
        ]))

    if use_dask:
        gdf = dd.from_pandas(gdf, npartitions=2)

    # geometry starts out as the first compatible column in the data frame
    assert gdf.geometry.name == 'points'

    # set_geometry defaults to a copy operation
    assert gdf.set_geometry('line').geometry.name == 'line'
    assert gdf.geometry.name == 'points'

    # set_geometry with inplace=True mutates the active geometry column
    if use_dask:
        # inplace not supported for DaskGeoDataFrame
        gdf = gdf.set_geometry('line')
    else:
        gdf.set_geometry('line', inplace=True)
    assert gdf.geometry.name == 'line'

    # Active geometry propagates through slicing
    sliced_gdf = gdf.loc[[0, 2, 1, 0]]
    assert isinstance(sliced_gdf, type(gdf))
    assert sliced_gdf.geometry.name == 'line'

    # Select columns not including active geometry
    selected_gdf = gdf[['a', 'points']]
    with pytest.raises(ValueError):
        selected_gdf.geometry

    assert selected_gdf.set_geometry('points').geometry.name == 'points'
Example #16
def test_pack_partitions_to_parquet_list_bounds(
    gp_multipoint1,
    gp_multiline1,
    gp_multipoint2,
    gp_multiline2,
    bounds,
    tmp_path,
):
    # Build dataframe1
    n = min(len(gp_multipoint1), len(gp_multiline1))
    df1 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint1[:n]),
        'lines': GeoSeries(gp_multiline1[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf1 = dd.from_pandas(df1, npartitions=3)
    path1 = tmp_path / 'ddf1.parq'
    ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

    # Build dataframe2
    n = min(len(gp_multipoint2), len(gp_multiline2))
    df2 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint2[:n]),
        'lines': GeoSeries(gp_multiline2[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf2 = dd.from_pandas(df2, npartitions=3)
    path2 = tmp_path / 'ddf2.parq'
    ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

    # Load both packed datasets with glob
    ddf_read = read_parquet_dask(
        [str(tmp_path / "ddf1.parq"),
         str(tmp_path / "ddf2.parq")],
        geometry="points",
        bounds=bounds)

    # Check the number of partitions (< 7 can happen in the case of empty partitions)
    assert ddf_read.npartitions <= 7

    # Check contents
    xslice = slice(bounds[0], bounds[2])
    yslice = slice(bounds[1], bounds[3])
    expected_df = pd.concat([
        ddf_packed1.cx_partitions[xslice, yslice].compute(),
        ddf_packed2.cx_partitions[xslice, yslice].compute()
    ])
    df_read = ddf_read.compute()
    pd.testing.assert_frame_equal(df_read, expected_df)

    # Compute expected partition bounds
    points_bounds = pd.concat([
        ddf_packed1._partition_bounds['points'],
        ddf_packed2._partition_bounds['points'],
    ]).reset_index(drop=True)

    x0, y0, x1, y1 = bounds
    x0, x1 = (x0, x1) if x0 <= x1 else (x1, x0)
    y0, y1 = (y0, y1) if y0 <= y1 else (y1, y0)
    partition_inds = ~((points_bounds.x1 < x0) | (points_bounds.y1 < y0) |
                       (points_bounds.x0 > x1) | (points_bounds.y0 > y1))
    points_bounds = points_bounds[partition_inds].reset_index(drop=True)

    lines_bounds = pd.concat([
        ddf_packed1._partition_bounds['lines'],
        ddf_packed2._partition_bounds['lines'],
    ]).reset_index(drop=True)[partition_inds].reset_index(drop=True)
    points_bounds.index.name = 'partition'
    lines_bounds.index.name = 'partition'

    # Check partition bounds
    pd.testing.assert_frame_equal(points_bounds,
                                  ddf_read._partition_bounds['points'])

    pd.testing.assert_frame_equal(lines_bounds,
                                  ddf_read._partition_bounds['lines'])

    # Check active geometry column
    assert ddf_read.geometry.name == 'points'
Example #17
def _perform_read_parquet_dask(
    paths,
    columns,
    filesystem,
    load_divisions,
    geometry=None,
    bounds=None,
    categories=None,
):
    filesystem = validate_coerce_filesystem(paths[0], filesystem)
    datasets = [
        pa.parquet.ParquetDataset(path,
                                  filesystem=filesystem,
                                  validate_schema=False) for path in paths
    ]

    # Create delayed partition for each piece
    pieces = []
    for dataset in datasets:
        # Perform natural sort on pieces so that "part.10" comes after "part.2"
        dataset_pieces = sorted(dataset.pieces,
                                key=lambda piece: natural_sort_key(piece.path))
        pieces.extend(dataset_pieces)

    delayed_partitions = [
        delayed(read_parquet)(piece.path,
                              columns=columns,
                              filesystem=filesystem) for piece in pieces
    ]

    # Load divisions
    if load_divisions:
        div_mins_list, div_maxes_list = zip(
            *[_load_divisions(dataset) for dataset in datasets])

        div_mins = reduce(lambda a, b: a + b, div_mins_list, [])
        div_maxes = reduce(lambda a, b: a + b, div_maxes_list, [])
    else:
        div_mins = None
        div_maxes = None

    # load partition bounds
    partition_bounds_list = [
        _load_partition_bounds(dataset) for dataset in datasets
    ]
    if not any([b is None for b in partition_bounds_list]):
        partition_bounds = {}
        # We have partition bounds for all datasets
        for partition_bounds_el in partition_bounds_list:
            for col, col_bounds in partition_bounds_el.items():
                col_bounds_list = partition_bounds.get(col, [])
                col_bounds_list.append(col_bounds)
                partition_bounds[col] = col_bounds_list

        # Concat bounds for each geometry column
        for col in list(partition_bounds):
            partition_bounds[col] = pd.concat(partition_bounds[col],
                                              axis=0).reset_index(drop=True)
            partition_bounds[col].index.name = 'partition'
    else:
        partition_bounds = {}

    # Use Dask's read_parquet to get metadata
    if columns is not None:
        cols_no_index = [col for col in columns if col != "hilbert_distance"]
    else:
        cols_no_index = None

    meta = dd_read_parquet(
        paths[0],
        columns=cols_no_index,
        filesystem=filesystem,
        engine='pyarrow',
        categories=categories,
        gather_statistics=False,
    )._meta

    # Import geometry columns in meta, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(paths[0], filesystem=filesystem)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        meta = _import_geometry_columns(meta, geom_cols)
    meta = GeoDataFrame(meta)

    # Handle geometry in meta
    if geometry:
        meta = meta.set_geometry(geometry)

    geometry = meta.geometry.name

    # Filter partitions by bounding box
    if bounds and geometry in partition_bounds:
        # Unpack bounds coordinates
        x0, y0, x1, y1 = bounds

        # Make sure x0 < x1
        if x0 > x1:
            x0, x1 = x1, x0

        # Make sure y0 < y1
        if y0 > y1:
            y0, y1 = y1, y0

        # Make DataFrame with bounds and parquet piece
        partitions_df = partition_bounds[geometry].assign(
            delayed_partition=delayed_partitions)

        if load_divisions:
            partitions_df = partitions_df.assign(div_mins=div_mins,
                                                 div_maxes=div_maxes)

        inds = ~((partitions_df.x1 < x0) | (partitions_df.y1 < y0) |
                 (partitions_df.x0 > x1) | (partitions_df.y0 > y1))

        partitions_df = partitions_df[inds]
        for col in list(partition_bounds):
            partition_bounds[col] = partition_bounds[col][inds]
            partition_bounds[col].reset_index(drop=True, inplace=True)
            partition_bounds[col].index.name = "partition"

        delayed_partitions = partitions_df.delayed_partition.tolist()
        if load_divisions:
            div_mins = partitions_df.div_mins.tolist()
            div_maxes = partitions_df.div_maxes.tolist()

    if load_divisions:
        divisions = div_mins + [div_maxes[-1]]
        if divisions != sorted(divisions):
            raise ValueError(
                "Cannot load divisions because the discovered divisions are unsorted.\n"
                "Set load_divisions=False to skip loading divisions.")
    else:
        divisions = None

    # Create DaskGeoDataFrame
    if delayed_partitions:
        result = from_delayed(delayed_partitions,
                              divisions=divisions,
                              meta=meta,
                              verify_meta=False)
    else:
        # Single partition empty result
        result = from_pandas(meta, npartitions=1)

    # Set partition bounds
    if partition_bounds:
        result._partition_bounds = partition_bounds

    return result
Example #18
def to_spatialpandas(data, xdim, ydim, columns=[], geom='point'):
    """Converts list of dictionary format geometries to spatialpandas line geometries.

    Args:
        data: List of dictionaries representing individual geometries
        xdim: Name of x-coordinates column
        ydim: Name of y-coordinates column
        columns: List of columns to add
        geom: The type of geometry

    Returns:
        A spatialpandas.GeoDataFrame version of the data
    """
    from spatialpandas import GeoSeries, GeoDataFrame
    from spatialpandas.geometry import (Point, Line, Polygon, Ring, LineArray,
                                        PolygonArray, PointArray,
                                        MultiLineArray, MultiPolygonArray,
                                        MultiPointArray, RingArray)
    from ...element import Polygons
    poly = any(Polygons._hole_key in d for d in data) or geom == 'Polygon'
    if poly:
        geom_type = Polygon
        single_array, multi_array = PolygonArray, MultiPolygonArray
    elif geom == 'Line':
        geom_type = Line
        single_array, multi_array = LineArray, MultiLineArray
    elif geom == 'Ring':
        geom_type = Ring
        single_array, multi_array = RingArray, MultiLineArray
    else:
        geom_type = Point
        single_array, multi_array = PointArray, MultiPointArray

    array_type = None
    hole_arrays, geom_arrays = [], []
    for geom in data:
        geom = dict(geom)
        if xdim not in geom or ydim not in geom:
            raise ValueError('Could not find geometry dimensions')
        xs, ys = geom.pop(xdim), geom.pop(ydim)
        xscalar, yscalar = isscalar(xs), isscalar(ys)
        if xscalar and yscalar:
            xs, ys = np.array([xs]), np.array([ys])
        elif xscalar:
            xs = np.full_like(ys, xs)
        elif yscalar:
            ys = np.full_like(xs, ys)
        geom_array = np.column_stack([xs, ys])

        if geom_type in (Polygon, Ring):
            geom_array = ensure_ring(geom_array)

        splits = np.where(
            np.isnan(geom_array[:, :2].astype('float')).sum(axis=1))[0]
        split_geoms = np.split(geom_array, splits +
                               1) if len(splits) else [geom_array]
        split_holes = geom.pop(Polygons._hole_key, None)
        if split_holes is not None:
            if len(split_holes) != len(split_geoms):
                raise DataError(
                    'Polygons with holes containing multi-geometries '
                    'must declare a list of holes for each geometry.',
                    SpatialPandasInterface)
            else:
                split_holes = [[ensure_ring(np.asarray(h)) for h in hs]
                               for hs in split_holes]

        geom_arrays.append(split_geoms)
        hole_arrays.append(split_holes)
        if geom_type is Point:
            if len(splits) > 1 or any(len(g) > 1 for g in split_geoms):
                array_type = multi_array
            elif array_type is None:
                array_type = single_array
        elif len(splits):
            array_type = multi_array
        elif array_type is None:
            array_type = single_array

    converted = defaultdict(list)
    for geom, arrays, holes in zip(data, geom_arrays, hole_arrays):
        parts = []
        for i, g in enumerate(arrays):
            if i != (len(arrays) - 1):
                g = g[:-1]
            if len(g) < (3 if poly else 2) and geom_type is not Point:
                continue
            if poly:
                parts.append([])
                subparts = parts[-1]
            else:
                subparts = parts
            subparts.append(g[:, :2])
            if poly and holes is not None:
                subparts += [np.array(h) for h in holes[i]]

        for c, v in geom.items():
            converted[c].append(v)

        if array_type is PointArray:
            parts = parts[0].flatten()
        elif array_type is MultiPointArray:
            parts = np.concatenate([sp.flatten() for sp in parts])
        elif array_type is multi_array:
            parts = [[ssp.flatten() for ssp in sp] if poly else sp.flatten()
                     for sp in parts]
        else:
            parts = [np.asarray(sp).flatten()
                     for sp in parts[0]] if poly else parts[0].flatten()
        converted['geometry'].append(parts)

    if converted:
        geometries = converted['geometry']
        if array_type is PointArray:
            geometries = np.concatenate(geometries)
        geom_array = array_type(geometries)
        if poly:
            geom_array = geom_array.oriented()
        converted['geometry'] = GeoSeries(geom_array)
    else:
        converted['geometry'] = GeoSeries(single_array([]))
    return GeoDataFrame(converted, columns=['geometry'] + columns)
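A hedged usage sketch for the converter above (it assumes the HoloViews module context the function comes from); the dictionary records and the 'value' column are illustrative only.

import numpy as np

data = [
    {'x': np.array([0., 1., 2.]), 'y': np.array([0., 1., 0.]), 'value': 1},
    {'x': np.array([2., 3.]), 'y': np.array([0., 1.]), 'value': 2},
]
# Produces a GeoDataFrame with a line-typed 'geometry' column plus a 'value' column
gdf = to_spatialpandas(data, 'x', 'y', columns=['value'], geom='Line')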
Example #19
def dask_GeoDataFrame(*args, **kwargs):
    return dd.from_pandas(GeoDataFrame(*args, **kwargs), npartitions=3)
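A short sketch of how this test helper might be used; the column contents are hypothetical.

from spatialpandas.geometry import PointArray

# Same constructor arguments as spatialpandas.GeoDataFrame, but the result is a
# DaskGeoDataFrame split into 3 partitions; .compute() returns the pandas-backed frame.
dgdf = dask_GeoDataFrame({
    'a': [0, 1, 2],
    'geometry': PointArray([[0, 0], [1, 1], [2, 2]]),
})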
Example #20
def append_traj_info(
    df: spd.GeoDataFrame,
    gdfs: Dict[str, gpd.GeoDataFrame],
) -> spd.GeoDataFrame:
    """Append trajectory info from shape files.

    Parameters
    ----------
    df
        DataFrame containing trajectories.
    gdfs
        Dict containing the following GeoPandas GeoDataFrames:
            - tracts
            - tsz
            - city
            - county

    Returns
    -------
    Dataframe with appended information.

    """
    df = df.reset_index(drop=True)
    start_locations = gpd.GeoDataFrame(
        geometry=df.start_geohash.apply(geohash_decode_point),
        index=df.index,
        crs="EPSG:4326",
    )
    end_locations = gpd.GeoDataFrame(
        geometry=df.end_geohash.apply(geohash_decode_point),
        index=df.index,
        crs="EPSG:4326",
    )

    tracts = gdfs["tracts"].to_crs("EPSG:4326")
    df["start_CensusBlock2019"] = (
        gpd.sjoin(
            start_locations,
            tracts[["geometry", "GEOID"]].set_index("GEOID"),
            how="left",
        ).rename(columns={
            "index_right": "start_CensusBlock2019"
        }).drop(columns=["geometry"]).astype(str).fillna("unknown")
        ["start_CensusBlock2019"].pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_CensusBlock2019"] = (
        gpd.sjoin(
            end_locations,
            tracts[["geometry", "GEOID"]].set_index("GEOID"),
            how="left",
        ).rename(columns={
            "index_right": "end_CensusBlock2019"
        }).drop(columns=["geometry"]).astype(str).fillna("unknown")
        ["end_CensusBlock2019"].pipe(lambda s: s.groupby(s.index).head(1)))

    tsz = gdfs["tsz"].to_crs("EPSG:4326")
    df["start_TSZ"] = (
        gpd.sjoin(
            start_locations,
            tsz[["geometry", "TSZ"]].set_index("TSZ"),
            how="left",
        ).rename(columns={
            "index_right": "start_TSZ"
        }).drop(columns=["geometry"]).astype(str).fillna("Out of NCTCOG area")
        ["start_TSZ"].pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_TSZ"] = (gpd.sjoin(
        end_locations,
        tsz[["geometry", "TSZ"]].set_index("TSZ"),
        how="left",
    ).rename(columns={
        "index_right": "end_TSZ"
    }).drop(columns=["geometry"]).astype(str).fillna("Out of NCTCOG area")
                     ["end_TSZ"].pipe(lambda s: s.groupby(s.index).head(1)))

    county = gdfs["county"]  # File already in EPSG:4326
    df["start_county"] = (gpd.sjoin(
        start_locations,
        county[["geometry", "CNTY_NM"]].set_index("CNTY_NM"),
        how="left",
    ).rename(columns={
        "index_right": "start_county"
    }).drop(columns=["geometry"]).fillna("unknown")["start_county"].pipe(
        lambda s: s.groupby(s.index).head(1)))
    df["end_county"] = (gpd.sjoin(
        end_locations,
        county[["geometry", "CNTY_NM"]].set_index("CNTY_NM"),
        how="left",
    ).rename(columns={
        "index_right": "end_county"
    }).drop(columns=["geometry"]).fillna("unknown")["end_county"].pipe(
        lambda s: s.groupby(s.index).head(1)))

    city = gdfs["city"]  # File already in EPSG:4326
    df["start_city"] = (gpd.sjoin(
        start_locations,
        city[["geometry", "CITY_NM"]].set_index("CITY_NM"),
        how="left",
    ).rename(columns={
        "index_right": "start_city"
    }).drop(columns=["geometry"]).fillna("unknown")["start_city"].pipe(
        lambda s: s.groupby(s.index).head(1)))
    df["end_city"] = (gpd.sjoin(
        end_locations,
        city[["geometry", "CITY_NM"]].set_index("CITY_NM"),
        how="left",
    ).rename(columns={
        "index_right": "end_city"
    }).drop(columns=["geometry"]).fillna("unknown")["end_city"].pipe(
        lambda s: s.groupby(s.index).head(1)))

    return df
Example #21
def test_pack_partitions_to_parquet_glob(gp_multipoint1, gp_multiline1,
                                         gp_multipoint2, gp_multiline2,
                                         tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe1
        n = min(len(gp_multipoint1), len(gp_multiline1))
        df1 = GeoDataFrame({
            'points': GeoSeries(gp_multipoint1[:n]),
            'lines': GeoSeries(gp_multiline1[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf1 = dd.from_pandas(df1, npartitions=3)
        path1 = tmp_path / 'ddf1.parq'
        ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1),
                                                      npartitions=3)

        # Build dataframe2
        n = min(len(gp_multipoint2), len(gp_multiline2))
        df2 = GeoDataFrame({
            'points': GeoSeries(gp_multipoint2[:n]),
            'lines': GeoSeries(gp_multiline2[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf2 = dd.from_pandas(df2, npartitions=3)
        path2 = tmp_path / 'ddf2.parq'
        ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2),
                                                      npartitions=4)

        # Load both packed datasets with glob
        ddf_globbed = read_parquet_dask(tmp_path / "ddf*.parq",
                                        geometry="lines")

        # Check the number of partitions (< 7 can happen in the case of empty partitions)
        assert ddf_globbed.npartitions <= 7

        # Check contents
        expected_df = pd.concat([ddf_packed1.compute(), ddf_packed2.compute()])
        df_globbed = ddf_globbed.compute()
        pd.testing.assert_frame_equal(df_globbed, expected_df)

        # Check partition bounds
        expected_bounds = {
            'points':
            pd.concat([
                ddf_packed1._partition_bounds['points'],
                ddf_packed2._partition_bounds['points'],
            ]).reset_index(drop=True),
            'lines':
            pd.concat([
                ddf_packed1._partition_bounds['lines'],
                ddf_packed2._partition_bounds['lines'],
            ]).reset_index(drop=True),
        }
        expected_bounds['points'].index.name = 'partition'
        expected_bounds['lines'].index.name = 'partition'
        pd.testing.assert_frame_equal(expected_bounds['points'],
                                      ddf_globbed._partition_bounds['points'])

        pd.testing.assert_frame_equal(expected_bounds['lines'],
                                      ddf_globbed._partition_bounds['lines'])

        assert ddf_globbed.geometry.name == 'lines'
Example #22
def _sjoin_pandas_pandas(
        left_df, right_df, how="inner", op="intersects",
        lsuffix="left", rsuffix="right"
):
    from spatialpandas import GeoDataFrame

    # Record original index name(s), generate new index name(s), reset index column(s)
    original_right_df = right_df
    original_left_df = left_df
    right_df, right_index_name, index_right = _record_reset_index(
        original_right_df, rsuffix
    )
    left_df, left_index_name, index_left = _record_reset_index(
        original_left_df, lsuffix
    )

    if any(original_left_df.columns.isin(index_left + index_right)) or any(
            original_right_df.columns.isin(index_left + index_right)
    ):
        raise ValueError(
            "'{0}' and '{1}' cannot be column names in the GeoDataFrames being"
            " joined".format(index_left, index_right)
        )

    # Get spatial index for left frame
    sindex = left_df.geometry.sindex
    left_geom = left_df.geometry.array
    right_geom = right_df.geometry.array

    # Get bounds from right geometry
    right_bounds = right_df.geometry.bounds.values

    # Init list of arrays, the same length as right_df, where each array holds the
    # indices into left_df that intersect with the corresponding element.
    left_inds = [np.array([], dtype='uint32')] * len(right_df)

    # right_inds will hold the inds into right_df that correspond to left_inds
    right_inds = [np.array([], dtype='uint32')] * len(right_df)

    # Loop over the right frame
    for i in range(len(right_df)):
        # Get bounds for shape in current row of right_df
        shape_bounds = right_bounds[i, :]

        # Use spatial index on left_df to get indices of shapes with bounding boxes that
        # intersect with these bounds
        candidate_inds = sindex.intersects(shape_bounds)

        if len(candidate_inds) > 0:
            right_shape = right_geom[i]
            intersecting_mask = left_geom.intersects(right_shape, inds=candidate_inds)
            intersecting_inds = candidate_inds[intersecting_mask]
            left_inds[i] = intersecting_inds
            right_inds[i] = np.full(len(intersecting_inds), i)

    # Flatten nested arrays of indices
    if left_inds:
        flat_left_inds = np.concatenate(left_inds)
        flat_right_inds = np.concatenate(right_inds)
    else:
        flat_left_inds = np.array([], dtype='uint32')
        flat_right_inds = np.array([], dtype='uint32')

    # Build pandas DataFrame from inds
    result = pd.DataFrame({
        '_key_left': flat_left_inds,
        '_key_right': flat_right_inds
    })

    # Perform join
    if how == "inner":
        result = result.set_index("_key_left")
        joined = (
            left_df.merge(
                result, left_index=True, right_index=True
            ).merge(
                right_df.drop(right_df.geometry.name, axis=1),
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            ).set_index(
                index_left
            ).drop(
                ["_key_right"], axis=1
            )
        )
        if len(left_index_name) > 1:
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name[0]

    elif how == "left":
        result = result.set_index("_key_left")
        joined = (
            left_df.merge(
                result, left_index=True, right_index=True, how="left"
            ).merge(
                right_df.drop(right_df.geometry.name, axis=1),
                how="left",
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            ).set_index(
                index_left
            ).drop(
                ["_key_right"], axis=1
            )
        )
        if len(left_index_name) > 1:
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name[0]

    else:  # how == 'right':
        joined = (
            left_df.drop(
                left_df.geometry.name, axis=1
            ).merge(
                result.merge(
                    right_df, left_on="_key_right", right_index=True, how="right"
                ),
                left_index=True,
                right_on="_key_left",
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
                how="right",
            ).set_index(
                index_right
            ).drop(
                ["_key_left", "_key_right"], axis=1
            )
        )
        if len(right_index_name) > 1:
            joined.index.names = right_index_name
        else:
            joined.index.name = right_index_name[0]

    return GeoDataFrame(joined)
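A hedged usage sketch of a point-in-polygon inner join with the helper above; it assumes the surrounding module (in particular the _record_reset_index helper) and spatialpandas' point/polygon intersection support, and the frames are illustrative only.

from spatialpandas import GeoDataFrame
from spatialpandas.geometry import PointArray, PolygonArray

# Two points, one inside the unit square and one outside of it
left = GeoDataFrame({
    'name': ['inside', 'outside'],
    'geometry': PointArray([[0.5, 0.5], [3.0, 3.0]]),
})
# One polygon: the unit square as a counter-clockwise outer ring
right = GeoDataFrame({
    'region': ['unit square'],
    'geometry': PolygonArray([[[0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]]]),
})

joined = _sjoin_pandas_pandas(left, right, how='inner')
# Expect a single row pairing the 'inside' point with 'unit square'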