示例#1
0
    def test_aggplot(self):
        """Smoke-test ``gplt.aggplot`` with a PlateCarree projection.

        The plot is drawn once without a ``by`` argument and once with
        ``by='mock_category'``. There are no assertions: the test passes as
        long as neither call raises.
        """
        # An empty dict stands for "no extra keyword arguments".
        grouping_variants = ({}, {'by': 'mock_category'})
        try:
            for extra_kwargs in grouping_variants:
                # A new projection object is built for every call.
                gplt.aggplot(dataframe_gaussian_points, hue='mock_category',
                             projection=gcrs.PlateCarree(), **extra_kwargs)
        finally:
            # Runs whether or not a plotting call raised.
            plt.close()
示例#2
0
    def test_aggplot(self):
        """Smoke-test ``gplt.aggplot`` with a PlateCarree projection.

        The plot is drawn once without a ``by`` argument and once with
        ``by='mock_category'``. There are no assertions: the test passes as
        long as neither call raises.
        """
        # An empty dict stands for "no extra keyword arguments".
        grouping_variants = ({}, {'by': 'mock_category'})
        try:
            for extra_kwargs in grouping_variants:
                # A new projection object is built for every call.
                gplt.aggplot(dataframe_gaussian_points, hue='mock_category',
                             projection=gcrs.PlateCarree(), **extra_kwargs)
        finally:
            # Runs whether or not a plotting call raised.
            plt.close()
示例#3
0
 def test_aggplot(self, projection, sankey_hue, legend_vars,
                  sankey_data_inputs):
     """Smoke-test ``gplt.aggplot`` with one combination of keyword arguments.

     ``projection`` and ``sankey_hue`` become the ``projection`` and ``hue``
     keywords; ``legend_vars`` and ``sankey_data_inputs`` are mappings of
     further keywords merged on top (later mappings win on duplicate keys).
     The test passes as long as the call does not raise.
     """
     plot_kwargs = {
         'projection': projection,
         'hue': sankey_hue,
         **legend_vars,
         **sankey_data_inputs,
     }
     try:
         gplt.aggplot(agg_data, **plot_kwargs)
     finally:
         # Runs whether or not the plotting call raised.
         plt.close()
示例#4
0
def _geoplot_release_tuple(version):
    """
    Converts a dotted version string into a tuple of integers so that versions compare numerically.

    Only the leading ASCII digits of each dot-separated component are used, and parsing stops at the first component
    that does not start with a digit (so "0.3.0rc1" becomes `(0, 3, 0)` and "0.3.dev0" becomes `(0, 3)`).

    :param version: The version string to parse, e.g. "0.2.4".
    :return: A tuple of ints, e.g. `(0, 2, 4)`. Empty if the string does not start with a digit.
    """
    release = []
    for component in str(version).split('.'):
        digits = ''
        for char in component:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        release.append(int(digits))
    return tuple(release)


def geoplot(df,
            filter=None,
            n=0,
            p=0,
            x=None,
            y=None,
            figsize=(25, 10),
            inline=False,
            by=None,
            cmap='YlGn',
            **kwargs):
    """
    Generates a geographical data nullity heatmap, which shows the distribution of missing data across geographic
    regions. Every record is turned into a point at (`x`, `y`) carrying the fraction of its columns that are non-null,
    and the points are handed to `geoplot.aggplot` (from the `geoplot` package), averaged with `np.average`.

    This function is deprecated: it emits a warning on every call and only works with the `geoplot` package at
    version 0.2.4 or lower.

    :param df: The DataFrame whose completeness is being geoplotted.
    :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
    :param n: The cap on the number of columns to include in the filtered DataFrame.
    :param p: The cap on the percentage fill of the columns in the filtered DataFrame.
    :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
    :param x: The variable in the dataset containing the x-coordinates of the dataset. Required.
    :param y: The variable in the dataset containing the y-coordinates of the dataset. Required.
    :param by: If specified, the column used to cluster points in the same area (convex hull mode). Records with a
    null value in this column, and clusters with fewer than three points, are dropped with a warning. If not
    specified, `aggplot` is called with `by=None` (quadtree mode).
    :param cmap: The colormap to display the data with. Defaults to `YlGn`.
    :param inline: Deprecated. If True, the plot is shown with `plt.show()` and nothing is returned.
    :param kwargs: Additional keyword arguments are passed to the underlying `geoplot.aggplot` function.
    :return: If `inline` is False, the current `matplotlib` axes (`plt.gca()`). Else, nothing.
    :raises ImportError: If the `geoplot` package is missing, or is version 0.3.0 or newer.
    :raises ValueError: If `x` or `y` is not specified.
    """
    warnings.warn(
        "The 'geoplot' function has been deprecated, and will be removed in a future version "
        "of missingno. The 'geoplot' package has an example recipe for a more full-featured "
        "geospatial nullity plot: "
        "https://residentmario.github.io/geoplot/gallery/plot_san_francisco_trees.html"
    )
    try:
        import geoplot as gplt
    except ImportError:
        raise ImportError(
            "Install geoplot <= 0.2.4 (the package) for geoplot function support"
        )

    # Compare release numbers numerically: a plain string comparison would rank "0.10.0" below "0.3.0" and let
    # an unsupported version through.
    if _geoplot_release_tuple(gplt.__version__) >= (0, 3):
        raise ImportError(
            "The missingno geoplot function requires geoplot package version 0.2.4 or lower. "
            "To use the geoplot function, downgrade to an older version of the geoplot package."
        )

    import geopandas as gpd
    from shapely.geometry import Point

    df = nullity_filter(df, filter=filter, n=n, p=p)

    # Per-record completeness: the share of columns that hold a value.
    nullity = df.notnull().sum(axis='columns') / df.shape[1]
    if x and y:
        gdf = gpd.GeoDataFrame(nullity,
                               columns=['nullity'],
                               geometry=df.apply(
                                   lambda srs: Point(srs[x], srs[y]),
                                   axis='columns'))
    else:
        raise ValueError("The 'x' and 'y' parameters must be specified.")

    if by:
        if df[by].isnull().any():
            warnings.warn(
                'The "{0}" column included null values. The offending records were dropped'
                .format(by))
            df = df.dropna(subset=[by])
            gdf = gdf.loc[df.index]

        vc = df[by].value_counts()
        if (vc < 3).any():
            warnings.warn(
                'Grouping by "{0}" included clusters with fewer than three points, which cannot be made '
                'polygonal. The offending records were dropped.'.format(by))
            # Keep only the records whose cluster has at least three points.
            where = df[by].isin(vc[vc > 2].index)
            gdf = gdf.loc[where]
        # Index-aligned assignment: only the rows still present in gdf receive a value.
        gdf[by] = df[by]

    gplt.aggplot(gdf,
                 figsize=figsize,
                 hue='nullity',
                 agg=np.average,
                 cmap=cmap,
                 by=by,
                 edgecolor='None',
                 **kwargs)
    ax = plt.gca()

    if inline:
        warnings.warn(
            "The 'inline' argument has been deprecated, and will be removed in a future version "
            "of missingno.")
        plt.show()
    else:
        return ax
示例#5
0
    def test_aggplot(self):
        """Smoke-test ``gplt.aggplot`` across the accepted input types.

        Covers a Series and a DataFrame as data, ``hue`` given as a list, a
        Series, a map and a column name, and ``by`` given as a column name, a
        Series, a list and a map, each both without and with an explicit
        ``geometry``. The test passes as long as no call raises.
        """
        def hue_variants():
            # Lazily produced so every value is created right before its call.
            yield list_hue_values
            yield series_hue_values
            yield map_hue_values()
            yield 'hue_var'

        def by_variants():
            # Re-read the column for every variant; a ``map`` iterator in
            # particular can only be consumed once, so it must be fresh.
            yield 'mock_category'  # column name
            yield dataframe_gaussian_points['mock_category']  # Series
            yield list(dataframe_gaussian_points['mock_category'])  # list
            yield map(lambda v: v,
                      list(dataframe_gaussian_points['mock_category']))  # map

        try:
            gplt.aggplot(series_gaussian_points, hue=list_hue_values)
            gplt.aggplot(dataframe_gaussian_points, hue=list_hue_values)

            for hue_value in hue_variants():
                gplt.aggplot(dataframe_gaussian_points, hue=hue_value)

            # First every ``by`` variant without geometry, then again with it.
            for geometry_kwargs in ({}, {'geometry': aggplot_geometries}):
                for by_value in by_variants():
                    gplt.aggplot(dataframe_gaussian_points,
                                 hue=list_hue_values,
                                 by=by_value,
                                 **geometry_kwargs)
        finally:
            plt.close('all')
示例#6
0
# Reproject to EPSG:4326, renumber the rows from zero, then expose that numbering as a column named 'n'.
manhattan = (
    manhattan
    .to_crs(epsg=4326)
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'n'})
)

# Plot the data.

# This plot demonstrates a useful trick. When a geometry is provided, aggplot expects an iterable of geometries to
# use for binning observations. In general there are n observations and some smaller number k of locations
# containing them; observations that fall in the same bin are matched, aggregated in some way, and plotted.
#
# But what if n == k, that is, what if every observation comes with its own location? In that case we can pass
# those locations to the ``geometry`` parameter and the data's index to the ``by`` parameter, and ``aggplot`` will
# plot all of our records one at a time.
#
# This is handy for a wide variety of datasets. Here we plot building ages in Manhattan using data taken from
# MapPLUTO (http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page).
#
# Note that this plot is for the purposes of example only: it contains 40,000 geometries (far more than palatable)
# and so takes a long time to render. To explore the data for real, take a look at this all-NYC webmap:
# http://pureinformation.net/building-age-nyc/.
ax = gplt.aggplot(
    manhattan, projection=gcrs.PlateCarree(),
    geometry=manhattan.geometry, by=pd.Series(manhattan.index),
    hue='YearBuilt', linewidth=0,
)

ax.set_title("Buildings in Manhattan by Year Built")
plt.savefig("aggplot-singular.png", bbox_inches='tight', pad_inches=0.1)
示例#7
0
def geoplot(df,
            filter=None, n=0, p=0, sort=None,
            x=None, y=None, figsize=(25, 10), inline=False,
            by=None, cmap='YlGn', **kwargs):
    """
    Draws a geographical nullity heatmap showing how missing data is distributed across space. Each record becomes a
    point at (`x`, `y`) carrying the fraction of its columns that are non-null; the points are then passed to
    `geoplot.aggplot` (from the `geoplot` package) and averaged with `np.average`.

    :param df: The DataFrame whose completeness is being geoplotted.
    :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
    :param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None.
    :param n: The cap on the number of columns to include in the filtered DataFrame.
    :param p: The cap on the percentage fill of the columns in the filtered DataFrame.
    :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
    :param x: The variable in the dataset containing the x-coordinates of the dataset. Required.
    :param y: The variable in the dataset containing the y-coordinates of the dataset. Required.
    :param by: If specified, the column used to cluster points in the same area (convex hull mode). Records with a
    null value in this column, and clusters with fewer than three points, are dropped with a warning. If not
    specified, `aggplot` is called with `by=None` (quadtree mode).
    :param cmap: The colormap to display the data with. Defaults to `YlGn`.
    :param inline: If True, the plot is shown with `plt.show()` and nothing is returned.
    :param kwargs: Additional keyword arguments are passed to the underlying `geoplot.aggplot` function.
    :return: If `inline` is False, the current `matplotlib` axes (`plt.gca()`). Else, nothing.
    :raises ValueError: If `x` or `y` is not specified.
    """
    import geoplot as gplt
    import geopandas as gpd
    from shapely.geometry import Point

    df = nullity_sort(nullity_filter(df, filter=filter, n=n, p=p), sort=sort)

    # Per-record completeness: the share of columns that hold a value.
    completeness = df.notnull().sum(axis='columns') / df.shape[1]
    if not (x and y):
        raise ValueError("The 'x' and 'y' parameters must be specified.")
    gdf = gpd.GeoDataFrame(
        completeness,
        columns=['nullity'],
        geometry=df.apply(lambda row: Point(row[x], row[y]), axis='columns'),
    )

    if by:
        if df[by].isnull().any():
            warnings.warn(
                'The "{0}" column included null values. '
                'The offending records were dropped'.format(by)
            )
            df = df.dropna(subset=[by])
            gdf = gdf.loc[df.index]

        group_sizes = df[by].value_counts()
        if (group_sizes < 3).any():
            warnings.warn(
                'Grouping by "{0}" included clusters with fewer than three points, '
                'which cannot be made polygonal. The offending records were dropped.'.format(by)
            )
            # Keep only the records whose cluster has more than two points.
            large_groups = group_sizes[group_sizes > 2].index.values
            gdf = gdf.loc[df[by].isin(large_groups)]
        # Index-aligned assignment: only the rows still present in gdf receive a value.
        gdf[by] = df[by]

    gplt.aggplot(
        gdf,
        figsize=figsize,
        hue='nullity',
        agg=np.average,
        cmap=cmap,
        by=by,
        edgecolor='None',
        **kwargs
    )
    ax = plt.gca()

    if not inline:
        return ax
    plt.show()
示例#8
0
# Load the data (uses the `quilt` package).
from quilt.data.ResidentMario import geoplot_data
import geopandas as gpd

# Zip code polygons, indexed by their id converted to float.
boston_zip_codes = (
    gpd.read_file(geoplot_data.boston_zip_codes())
    .assign(id=lambda zips: zips.id.astype(float))
    .set_index('id')
)

# AirBnB listings, with the zipcode column converted to float as well
# (presumably so it matches the float index of the polygons above -- confirm).
listings = gpd.read_file(geoplot_data.boston_airbnb_listings()).assign(
    zipcode=lambda frame: frame.zipcode.astype(float)
)


# Plot the data.
import geoplot as gplt
import geoplot.crs as gcrs
import numpy as np
import matplotlib.pyplot as plt

# Base layer: the zip code outlines in light gray.
ax = gplt.polyplot(
    boston_zip_codes.geometry,
    projection=gcrs.AlbersEqualArea(),
    facecolor='lightgray',
    edgecolor='gray',
    linewidth=0,
)

# Overlay on the same axes: listing prices grouped by zip code and aggregated with the median.
gplt.aggplot(
    listings,
    projection=gcrs.AlbersEqualArea(),
    hue='price',
    by='zipcode',
    geometry=boston_zip_codes.geometry,
    agg=np.median,
    ax=ax,
    linewidth=0,
)

ax.set_title("Median AirBnB Price by Boston Zip Code, 2016")
plt.savefig("boston-airbnb-aggplot.png", bbox_inches='tight', pad_inches=0.1)
# Plot the data.
import geoplot as gplt
import geoplot.crs as gcrs
import numpy as np
import matplotlib.pyplot as plt

# Three stacked subplots sharing one Albers equal-area projection centered on the given latitude/longitude.
f, axarr = plt.subplots(
    3, 1,
    figsize=(12, 12),
    subplot_kw={
        'projection': gcrs.AlbersEqualArea(central_latitude=40.7128, central_longitude=-74.0059)
    },
)
plt.suptitle('Max(Injuries) in Collision by Area, 2016', fontsize=16)
plt.subplots_adjust(top=0.95)


# First plot: no ``by`` and no ``geometry``; the maximum injury count is taken per bin.
ax1 = gplt.aggplot(
    collisions,
    projection=gcrs.AlbersEqualArea(),
    hue='NUMBER OF PERSONS INJURED',
    agg=np.max,
    cmap='Reds',
    nmin=100,
    nmax=500,
    linewidth=0.5,
    edgecolor='white',
    ax=axarr[0],
)
ax1.set_title("No Geometry (Quadtree)")


# Second plot: observations are grouped by the 'ZIP CODE' column.
ax2 = gplt.aggplot(
    collisions,
    projection=gcrs.AlbersEqualArea(),
    hue='NUMBER OF PERSONS INJURED',
    agg=np.max,
    cmap='Reds',
    by='ZIP CODE',
    linewidth=0.5,
    edgecolor='white',
    ax=axarr[1],
)
ax2.set_title("Categorical Geometry (Convex Hull)")


zip_codes = gplt.datasets.load('nyc-zip-codes')
ax3 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(),
                   hue='NUMBER OF PERSONS INJURED', agg=np.max, by='ZIP CODE', geometry=zip_codes.geometry,
示例#10
0
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import numpy as np
import matplotlib.pyplot as plt

# Load the data: zip code polygons (indexed by their id converted to float) and the AirBnB listings.
boston_zip_codes = (
    gpd.read_file(gplt.datasets.get_path('boston_zip_codes'))
    .assign(id=lambda zips: zips.id.astype(float))
    .set_index('id')
)
boston_airbnb_listings = gpd.read_file(gplt.datasets.get_path('boston_airbnb_listings'))

# A single projection object is shared by both layers.
proj = gcrs.AlbersEqualArea()

# Base layer: the zip code outlines in light gray.
ax = gplt.polyplot(
    boston_zip_codes, projection=proj,
    facecolor='lightgray', edgecolor='gray', linewidth=0,
)

# Overlay on the same axes: listing prices grouped by zip code and aggregated with the median.
gplt.aggplot(
    boston_airbnb_listings, projection=proj,
    hue='price', by='zipcode', geometry=boston_zip_codes,
    agg=np.median, ax=ax, linewidth=0,
)

ax.set_title("Median AirBnB Price by Boston Zip Code, 2016")
plt.savefig("boston-airbnb-aggplot.png", bbox_inches='tight', pad_inches=0.1)
                        })
plt.suptitle('Max(Injuries) in Collision by Area, 2016', fontsize=16)
plt.subplots_adjust(top=0.95)

# In the first plot we provide no geographic data at all as input. In this case aggplot takes the centroids of
# whatever we pass it and uses them to decompose the extent of our data into squares, with a certain
# user-specified minimum (nmin) and maximum (nmax) number of observations per square. This is known in the
# literature as a QuadTree. An additional parameter, nsig, controls how many observations a square must contain to
# be considered significant (insignificant and empty squares are not colored in). The agg parameter controls how
# the observations are aggregated: np.mean is the default, and here we ask for the maximum (np.max) instead.
ax1 = gplt.aggplot(
    collisions, projection=gcrs.AlbersEqualArea(),
    hue='NUMBER OF PERSONS INJURED', agg=np.max,
    nmin=100, nmax=500,
    cmap='Reds', linewidth=0.5, edgecolor='white',
    ax=axarr[0],
)
ax1.set_title("No Geometry (Quadtree)")

# In the second plot we provide more information than in the first, by specifying a categorical column of data in
# the dataset that corresponds to some sort of encoded geography---in this example, the postal zip code. Aggplot
# computes the geometries it needs itself, using a simple convex hull around the observations' point cloud. Albeit
# not elegant, the resulting geometry is functional---and, again, spares us the task of having to find our own.
ax2 = gplt.aggplot(collisions,
                   projection=gcrs.AlbersEqualArea(),
                   hue='NUMBER OF PERSONS INJURED',
                   agg=np.max,
                   by='ZIP CODE',
示例#12
0
    def test_aggplot(self):
        """Smoke-test ``gplt.aggplot`` across the accepted input types.

        Covers a Series and a DataFrame as data, ``hue`` given as a list, a
        Series, a map and a column name, and ``by`` given as a column name, a
        Series, a list and a map, each both without and with an explicit
        ``geometry``. The test passes as long as no call raises.
        """
        def hue_variants():
            # Lazily produced so every value is created right before its call.
            yield list_hue_values
            yield series_hue_values
            yield map_hue_values()
            yield 'hue_var'

        def by_variants():
            # Re-read the column for every variant; a ``map`` iterator in
            # particular can only be consumed once, so it must be fresh.
            yield 'mock_category'  # column name
            yield dataframe_gaussian_points['mock_category']  # Series
            yield list(dataframe_gaussian_points['mock_category'])  # List
            yield map(lambda v: v, list(dataframe_gaussian_points['mock_category']))  # Map

        try:
            gplt.aggplot(series_gaussian_points, hue=list_hue_values)
            gplt.aggplot(dataframe_gaussian_points, hue=list_hue_values)

            for hue_value in hue_variants():
                gplt.aggplot(dataframe_gaussian_points, hue=hue_value)

            # First every ``by`` variant without geometry, then again with it.
            for geometry_kwargs in ({}, {'geometry': aggplot_geometries}):
                for by_value in by_variants():
                    gplt.aggplot(dataframe_gaussian_points, hue=list_hue_values,
                                 by=by_value, **geometry_kwargs)
        finally:
            plt.close('all')
示例#13
0
# Turn the current index into a regular column and call it 'n'.
manhattan = manhattan.reset_index()
manhattan = manhattan.rename(columns={'index': 'n'})


# Plot the data.

# This plot demonstrates a useful trick. When a geometry is provided, aggplot expects an iterable of geometries to
# use for binning observations. In general there are n observations and some smaller number k of locations
# containing them; observations that fall in the same bin are matched, aggregated in some way, and plotted.
#
# But what if n == k, that is, what if every observation comes with its own location? In that case we can pass
# those locations to the ``geometry`` parameter and the data's index to the ``by`` parameter, and ``aggplot`` will
# plot all of our records one at a time.
#
# This is handy for a wide variety of datasets. Here we plot building ages in Manhattan using data taken from
# MapPLUTO (http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page).
#
# Note that this plot is for the purposes of example only: it contains 40,000 geometries (far more than palatable)
# and so takes a long time to render. To explore the data for real, take a look at this all-NYC webmap:
# http://pureinformation.net/building-age-nyc/.
ax = gplt.aggplot(
    manhattan, projection=gcrs.PlateCarree(),
    geometry=manhattan.geometry, by=pd.Series(manhattan.index),
    hue='YearBuilt', linewidth=0,
)


ax.set_title("Buildings in Manhattan by Year Built")
plt.savefig("aggplot-singular.png", bbox_inches='tight', pad_inches=0.1)