Example #1
 def test_pointplot(self, projection, hue_vars, scale_vars, legend_vars):
     kwargs = {'projection': projection}
     kwargs = {**kwargs, **hue_vars, **scale_vars, **legend_vars}
     try:
         gplt.pointplot(gaussian_points, **kwargs)
     finally:
         plt.close()
Example #2
def plot_droughts_per_district(data,
                               label_col='drought reported',
                               district_col='District',
                               path='../',
                               country='Uganda',
                               admin_level=1):
    droughts_per_district = data[[district_col, label_col
                                  ]].groupby(district_col).sum().reset_index()
    gdf_country = gpd.read_file(get_country_shapefile(path=path,
                                                      country=country,
                                                      admin_level=admin_level))

    gdf_country.rename(columns={'ADM1_EN': district_col}, inplace=True)
    gdf_country['centroid'] = gdf_country.centroid

    droughts_per_district = gdf_country[[district_col, 'geometry', 'centroid'
                                         ]].merge(droughts_per_district,
                                                  on=district_col)
    droughts_per_district.set_geometry('centroid', drop=True, inplace=True)
    droughts_per_district = droughts_per_district[
        droughts_per_district[label_col] > 0]

    ax = geoplot.polyplot(gdf_country)  # polyplot returns the axis it draws on
    geoplot.pointplot(droughts_per_district,
                      scale=label_col,
                      color='darkred',
                      marker='o',
                      limits=(2, 14),
                      legend=True,
                      legend_values=[1, 3, 6, 9, 12],
                      ax=ax)
    return
Example #3
def _save_image(shape: gpd.GeoDataFrame, data: gpd.GeoDataFrame,
                output_file: str):
    fig, ax = plt.subplots(figsize=(6, 6))
    gplt.polyplot(shape, ax=ax, zorder=1)
    gplt.pointplot(data, color="red", s=.5, ax=ax, zorder=2)
    shape_bounds = shape.total_bounds
    ax.set_ylim(shape_bounds[1], shape_bounds[3])
    ax.set_xlim(shape_bounds[0], shape_bounds[2])
    logging.info(f"Saving image to {output_file}")
    plt.savefig(output_file, bbox_inches='tight', pad_inches=0.1, dpi=300)
    # plt.clf() clears but does not close the figure, which is what triggers
    # "RuntimeWarning: More than 20 figures have been opened." -- close it instead.
    plt.close(fig)
Example #4
def draw_month(month):
    to_full_month = {
        'Jan': 'January',
        'Feb': 'February',
        'Mar': 'March',
        'Apr': 'April',
        'May': 'May',
        'Jun': 'June',
        'Jul': 'July'
    }
    frames = []
    for i in range(1, 32):
        day_str = str(i)
        if i < 10:
            day_str = '0' + str(i)
        if os.path.exists(month + ' ' + day_str + '.csv'):
            df1 = pd.read_csv(month + ' ' + day_str + '.csv',
                              header=None,
                              names=[
                                  'id', 'longitude', 'latitude', 'location',
                                  'created_at', 'lang'
                              ])
            frames.append(df1)
    df = pd.concat(frames)
    print(df.shape)
    mydict = dict(df.location.value_counts())
    df['notnan'] = df['location'].notna()
    df['count'] = df.apply(lambda x: mydict[x.location] if x.notnan else 1,
                           axis=1)
    df.drop_duplicates(subset='location', keep='first', inplace=True)
    gdf = gpd.GeoDataFrame(df,
                           geometry=gpd.points_from_xy(df.longitude,
                                                       df.latitude))

    scheme = mc.Quantiles(df['count'], k=5)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

    ax = gplt.polyplot(
        world,
        edgecolor='white',
        facecolor='lightgray',
    )
    gplt.pointplot(gdf,
                   ax=ax,
                   hue='count',
                   cmap='Reds',
                   scale='count',
                   scheme=scheme,
                   legend=True,
                   legend_var='hue')
    ax.set_title('Discussion on Twitter, ' + to_full_month[month], fontsize=10)
    plt.savefig(month + '.png', dpi=1000)
Example #5
def test_param_extent_unproj():
    # invalid extent: raise
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(-181, 0, 1, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, -91, 1, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, 0, 181, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, 0, 1, 91))

    # valid extent: set
    return pointplot(p_df, hue='var', linewidth=0, s=10,
                     extent=(-10, -10, 10, 10)).get_figure()
Example #6
def plot_point_map(gpd_gdf, percentile=0, save_file=None):
    """plot point data on a map"""
    # Keep only the points whose NSE value exceeds the given percentile
    percentile_data = np.percentile(gpd_gdf['NSE'].values,
                                    percentile).astype(float)
    # query returns a filtered GeoDataFrame, which is what we want to plot
    data_chosen = gpd_gdf.query("NSE > " + str(percentile_data))
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))
    proj = gcrs.AlbersEqualArea(central_longitude=-98, central_latitude=39.5)
    polyplot_kwargs = {'facecolor': (0.9, 0.9, 0.9), 'linewidth': 0}
    pointplot_kwargs = {'hue': 'NSE', 'legend': True, 'linewidth': 0.01}
    # ax = gplt.polyplot(contiguous_usa.geometry, projection=proj, **polyplot_kwargs)
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator())
    gplt.pointplot(data_chosen, ax=ax, **pointplot_kwargs)
    ax.set_title("NSE " + "Map")
    plt.show()
    if save_file is not None:
        plt.savefig(save_file)
Example #7
def draw_per_day(date):
    if not os.path.exists(date + '.csv'):
        return
    print(date)
    df = pd.read_csv(date + '.csv',
                     header=None,
                     names=[
                         'id', 'longitude', 'latitude', 'location',
                         'created_at', 'lang'
                     ])
    print(date, df.shape)
    # assign count value
    mydict = dict(df.location.value_counts())

    df['notnan'] = df['location'].notna()
    df['count'] = df.apply(lambda x: mydict[x.location] if x.notnan else 1,
                           axis=1)
    df.drop_duplicates(subset='location', keep='first', inplace=True)
    gdf = gpd.GeoDataFrame(df,
                           geometry=gpd.points_from_xy(df.longitude,
                                                       df.latitude))

    scheme = mc.Quantiles(df['count'], k=5)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

    ax = gplt.polyplot(
        world,
        edgecolor='white',
        facecolor='lightgray',
    )
    gplt.pointplot(gdf,
                   ax=ax,
                   hue='count',
                   cmap='Reds',
                   scale='count',
                   scheme=scheme,
                   legend=True,
                   legend_var='hue')
    ax.set_title('Discussion on Twitter, ' + date, fontsize=10)
    plt.savefig(date + '.png', dpi=1000)
Example #8
    def test_pointplot(self):
        try:
            gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(), color='white')

            gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(), s=5)

            gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(), legend_kwargs={'fancybox': False})
        finally:
            plt.close()
Example #10
#geospatial areas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
Europe = (world.loc[world['continent'] == 'Europe'])
USA = (world.loc[world['continent'] == 'North America'])
Japan = (world.loc[world['name'] == 'Japan'])
Korea = (world.loc[world['name'] == 'South Korea'])

#geospatial pointplots
ax = gplt.polyplot(world, linewidth=0.7)
gplt.pointplot(gdf,
               hue=query,
               cmap='rainbow',
               k=2,
               alpha=0.8,
               scale=query,
               limits=(20, 20),
               legend=True,
               legend_values=[-1, 1],
               legend_labels=['negative', 'positive'],
               ax=ax)
plt.title('Global Station Distribution For Stations With Valid Data Series')

Eu_ax = gplt.polyplot(Europe, linewidth=0.7)
gplt.pointplot(Eu_gdf,
               hue=query,
               cmap='rainbow',
               k=2,
               alpha=0.8,
               scale=query,
               limits=(30, 30),
               ax=Eu_ax)  # snippet truncated in the source; closing kwarg assumed
Example #11
ax_hist2.set(ylabel='Japan-S.Korea')

sns.despine(ax=ax_hist)
sns.despine(ax=ax_hist1)
sns.despine(ax=ax_hist2)

#geospatial locations
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
Europe = (world.loc[world['continent'] == 'Europe'])
USA = (world.loc[world['continent'] == 'North America'])
Japan = (world.loc[world['name'] == 'Japan'])
Korea = (world.loc[world['name'] == 'South Korea'])

#plot of station distribution
ax = gplt.polyplot(world, linewidth=0.7)
gplt.pointplot(gdf, color='red', ax=ax)

#plt.title('Global Station Distribution For Stations With Valid Data Series')

#geospatial plots (titles are commented out)
Eu_ax = gplt.polyplot(Europe, linewidth=0.7)
gplt.pointplot(Eu_gdf,
               hue=query,
               cmap='plasma',
               k=None,
               alpha=1,
               scale=query,
               limits=(25, 25),
               legend=True,
               ax=Eu_ax)
Example #12
df_map = gpd.GeoDataFrame.from_file('Virtual_Map1.shp')
df_city = pd.read_csv("Virtual_City.csv")
geom = gpd.GeoSeries(
    [Point(x, y) for x, y in zip(df_city.long.values, df_city.lat.values)])
df_city = gpd.GeoDataFrame(df_city, geometry=geom)

#--------------------------------- (a) Black-and-white Voronoi map ----------------------------------------
ax1 = gplt.voronoi(
    df_city,  #projection=gcrs.AlbersEqualArea(),
    clip=df_map,
    linewidth=0.5,
    #hue='orange', cmap='Reds',k=5,
    legend=False,
    edgecolor='k')

ax2 = gplt.pointplot(df_city, color='white', s=10, edgecolors='k',
                     ax=ax1)  #zorder=2,
gplt.polyplot(df_map, edgecolor='none', facecolor='lightgray',
              ax=ax1)  #zorder=1,
#plt.savefig('沃罗诺伊地图2.pdf')

#--------------------------------- (b) Colored Voronoi map ----------------------------------------
ax = gplt.voronoi(
    df_city,  #projection=gcrs.AlbersEqualArea(),
    clip=df_map,
    hue='city',
    cmap='Set1',
    legend=True,
    k=10,
    edgecolor='w',
    alpha=0.75,
    legend_kwargs={'loc': 'lower right'})  # snippet truncated in the source; closing kwargs assumed
Example #13
def test_scale_params(kwargs):
    return pointplot(p_df, **kwargs).get_figure()
Example #14
def test_legend_params(kwargs):
    return pointplot(p_df, **kwargs).get_figure()
Example #15
    def test_pointplot(self):
        try:
            gplt.pointplot(series_gaussian_points, k=2)
            gplt.pointplot(dataframe_gaussian_points, k=2)

            gplt.pointplot(dataframe_gaussian_points,
                           hue=list_hue_values,
                           k=None)
            gplt.pointplot(dataframe_gaussian_points,
                           hue=series_hue_values,
                           k=None)
            gplt.pointplot(dataframe_gaussian_points,
                           hue=map_hue_values(),
                           k=None)
            gplt.pointplot(dataframe_gaussian_points, hue='hue_var', k=None)
        finally:
            plt.close('all')
Example #16
"""
Quadtree of NYC traffic collisions
==================================

This example plots traffic collisions in New York City. Overlaying a ``pointplot`` on a
``quadtree`` like this communicates information on two visual channels, position and texture,
simultaneously.
"""

import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

nyc_boroughs = gpd.read_file(gplt.datasets.get_path('nyc_boroughs'))
collisions = gpd.read_file(gplt.datasets.get_path('nyc_collision_factors'))

ax = gplt.quadtree(collisions,
                   nmax=1,
                   projection=gcrs.AlbersEqualArea(),
                   clip=nyc_boroughs,
                   facecolor='lightgray',
                   edgecolor='white',
                   zorder=0)
gplt.pointplot(collisions, s=1, ax=ax)

plt.title("New York Ciy Traffic Collisions, 2016")
Example #17
def geospatial_viz(geo_data_url,
                   point_data_url=None,
                   att_var=None,
                   map_type=None):
    '''
    Visualize attribute information on a map (e.g., population by state).

    geo_data_url: path/URL of a geodataframe containing both geometry and attribute info
    att_var: the attribute to be visualized on the map
    map_type: string; the type of map to draw: pointplot, choropleth, cartogram, or voronoi

    If no point data is supplied, att_var must come from geo_data.
    '''
    geo_data = gpd.read_file(geo_data_url)
    print(geo_data.head())

    if point_data_url == 'No point attribute data':
        if att_var is None:
            ax = gplt.polyplot(geo_data, figsize=(10, 5))
            ax.set_title('plain map of continental USA', fontsize=16)
        else:
            if map_type == 'choropleth':
                scheme = mc.FisherJenks(geo_data[att_var], k=5)
                labels = scheme.get_legend_classes()
                ax = gplt.polyplot(geo_data, projection=gcrs.AlbersEqualArea())
                gplt.choropleth(geo_data,
                                hue=att_var,
                                edgecolor='white',
                                linewidth=1,
                                cmap='Blues',
                                legend=True,
                                scheme=scheme,
                                legend_labels=labels,
                                ax=ax)
                ax.set_title('{} in the continental US'.format(att_var),
                             fontsize=16)

            if map_type == "cartogram":
                gplt.cartogram(geo_data,
                               scale=att_var,
                               edgecolor='black',
                               projection=gcrs.AlbersEqualArea())

    else:
        point_data = gpd.read_file(point_data_url)
        scheme = mc.Quantiles(point_data[att_var], k=5)
        labels = scheme.get_legend_classes()

        if map_type == 'pointplot':
            if isinstance(point_data.geometry[0],
                          shapely.geometry.point.Point):
                ax = gplt.polyplot(geo_data,
                                   edgecolor='white',
                                   facecolor='lightgray',
                                   figsize=(12, 8)
                                   #projection = gcrs.AlbersEqualArea()
                                   )
                gplt.pointplot(point_data,
                               ax=ax,
                               hue=att_var,
                               cmap='Blues',
                               scheme=scheme,
                               scale=att_var,
                               legend=True,
                               legend_var='scale',
                               legend_kwargs={"loc": 'lower right'},
                               legend_labels=labels)
                ax.set_title(
                    'Cities in the continental US, by population 2010',
                    fontsize=16)
            else:
                print('Geometry data type not valid')

        if map_type == "voronoi":
            # check uniqueness of coordinates
            duplicates = point_data.geometry.duplicated()
            point_data_unique = point_data[~duplicates]  # '~', not '-', negates a boolean mask
            proj = gplt.crs.AlbersEqualArea(central_longitude=-98,
                                            central_latitude=39.5)

            ax = gplt.voronoi(point_data_unique,
                              hue=att_var,
                              clip=geo_data,
                              projection=proj,
                              cmap='Blues',
                              legend=True,
                              edgecolor="white",
                              linewidth=0.01)

            gplt.polyplot(geo_data,
                          ax=ax,
                          extent=geo_data.total_bounds,
                          edgecolor="black",
                          linewidth=1,
                          zorder=1)
            plt.title("{} in US cities".format(att_var), fontsize=16)
Example #18
# ## Making Maps

# %% Collapsed="false"
f, ax = plt.subplots(dpi = 200)
countries.plot(edgecolor = 'k', facecolor = 'None', linewidth = 0.6, ax = ax)
cities.plot(markersize = 0.5, facecolor = 'red', ax = ax)
lat_am_capitals.plot(markersize = 0.5, facecolor = 'y', ax = ax)
ax.set_title('World Map')
ax.set_axis_off()

# %% [markdown] Collapsed="false"
# ## Static Webmaps

# %% Collapsed="false"
ax = gplt.webmap(countries, projection=gplt.crs.WebMercator(), figsize=(16, 12))
gplt.pointplot(cities, ax=ax, hue='POP2015')

# %% [markdown] Collapsed="false"
# ## Aside on Projections

# %% [markdown] Collapsed="false"
# Map projections flatten a globe's surface onto a 2D plane. This necessarily distorts the surface (one of Gauss's lesser-known results), so one must choose a specific form of 'acceptable' distortion.
#
# By convention, the standard coordinate reference system in GIS is the World Geodetic System (lat/lon - `WGS84`). Plotting raw lat/lon is effectively a cylindrical projection, which stretches distances east-west and *results in incorrect distance and areal calculations*. For accurate distance and area calculations, try to use UTM (which divides the map into zones). See [epsg.io](https://epsg.io).
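
# %% [markdown] Collapsed="false"
# A minimal sketch of the effect (assuming `countries` is in WGS84; the equal-area CRS used here, `EPSG:6933`, is an assumption for illustration):

# %% Collapsed="false"
area_deg2 = countries.geometry.area  # WGS84 units are degrees, so this is degrees^2 (not meaningful)
area_km2 = countries.to_crs(epsg=6933).geometry.area / 1e6  # equal-area CRS: m^2 -> km^2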

# %% Collapsed="false"
countries.crs

# %% Collapsed="false"
countries_2 = countries.copy()
countries_2 = countries_2.to_crs(epsg=3035)  # the {'init': ...} syntax is deprecated
Example #19
def main():
    use_trim = True
    update_data = True

    root_path = os.getcwd()
    cache_path = os.path.join(root_path,
                              r'data/202008-citibike-tripdata-trimmed.pickle')
    cache_path_full = os.path.join(
        root_path, r'data/202008-citibike-tripdata-full.pickle')

    if not update_data and use_trim:
        data = pickle.load(open(cache_path, 'rb'))
        print(f'Loaded trimmed data from {cache_path}')
    elif not update_data and not use_trim:
        data = pickle.load(open(cache_path_full, 'rb'))
    else:
        data_JC = pd.read_csv(
            os.path.join(root_path, r'data/JC-202008-citibike-tripdata.csv'))
        data_NY = pd.read_csv(
            os.path.join(root_path, r'data/202008-citibike-tripdata.csv'))
        print('Loaded full data')
        if use_trim:
            data_NY_part = data_NY[::100]
            # data=data_process_data(pd.concat([data_JC, data_NY_part]))
            data = data_process_data(copy.deepcopy(data_NY_part))
            pickle.dump(data, open(cache_path, 'wb'))
            print(f'Use trim data, saved a cache into {cache_path}')
        else:
            data_NY_part = data_NY[::10]
            # data=data_process_data(pd.concat([data_JC, data_NY_part]))
            data = data_process_data(copy.deepcopy(data_NY_part))
            pickle.dump(data, open(cache_path_full, 'wb'))

            print(f'Use full data, saved a cache into {cache_path_full}')

    mask = data[data['start station id'] == data['end station id']].index
    data = data.drop(index=mask)

    map_JC = gpd.read_file(
        os.path.join(
            root_path,
            r'Data/jersey-city-neighborhoods/jersey-city-neighborhoods.shp')
    ).to_crs(epsg=4326)
    map_JC = map_JC[['name', 'geometry']]
    map_JC['name'] = map_JC['name'].apply(lambda x: f'JC {x}')
    map_JC['region'] = 'JC'
    map_JC.columns = ['area', 'geometry', 'boro']
    map_NY = gpd.read_file(
        os.path.join(
            root_path,
            r'Data/Neighborhood Tabulation Areas/NY neighborhoods.shp')
    ).to_crs(epsg=4326)
    map_NY = map_NY[['ntaname', 'geometry', 'boro_name']]
    map_NY.columns = ['area', 'geometry', 'boro']
    map = pd.concat([map_JC, map_NY], ignore_index=True)
    map['centroid'] = map.geometry.centroid

    # EDA
    run_eda = False
    if run_eda:
        plt.close('all')
        data['start_hr'].value_counts(sort=False).plot(kind='bar')
        data['start_weekday'].value_counts(sort=False).plot(kind='bar')
        data['usertype'].value_counts(sort=False).plot(kind='bar')
        data.groupby('usertype')['start_weekday'].value_counts(
            sort=False).plot(kind='bar')
        data.groupby('usertype')['start_hr'].value_counts(sort=False).plot(
            kind='bar')
        ax = data[data['usertype'] == 'Subscriber'].groupby([
            'start_weekday'
        ])['start_hr'].value_counts(sort=False).plot(kind='bar')
        data[data['usertype'] == 'Customer'].groupby([
            'start_weekday'
        ])['start_hr'].value_counts(sort=False).plot(kind='bar',
                                                     ax=ax,
                                                     color='red')
        ax.xaxis.set_major_locator(ticker.NullLocator())
        # Outlier on the first two days - need to remove
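        # a sketch of that removal (hypothetical: assumes the raw citibike
        # 'starttime' column is still present after data_process_data):
        # data = data[pd.to_datetime(data['starttime']).dt.day > 2]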

    # get map and station info with area
    station_profile = summarize_station(data)
    station_profile = gpd.GeoDataFrame(
        station_profile,
        geometry=gpd.points_from_xy(station_profile['station longitude'],
                                    station_profile['station latitude']),
        crs='EPSG:4326')  # dict-style {'init': ...} crs is deprecated

    station_profile_gis = gpd.sjoin(station_profile,
                                    map,
                                    how='left',
                                    op='within')

    # summarize net rides by station by hour
    data = pd.merge(data,
                    area_concat(
                        'start',
                        station_profile_gis[['station id', 'area', 'boro']]),
                    how='left',
                    on='start station id')
    data = pd.merge(data,
                    area_concat(
                        'end',
                        station_profile_gis[['station id', 'area', 'boro']]),
                    how='left',
                    on='end station id')

    # group by station
    rides_byStation = summary_ride(data, 'station id')
    rides_byStation_byHour = summarize_rides_by_hour(rides_byStation,
                                                     'station id')

    # group by area
    rides_byArea = summary_ride(data, 'area')
    # len(rides_byArea[rides_byArea['net_checkout'].apply(lambda x: isinstance(x, float)==False)])
    rides_byArea_byHour = summarize_rides_by_hour(rides_byArea, 'area')
    # rides_byArea_byHour_gis=gpd.GeoDataFrame(rides_byArea_byHour.merge(map[['boro','area','centroid']], on='area'), geometry='centroid')
    rides_byArea_byHour_gis = gpd.GeoDataFrame(rides_byArea_byHour.merge(
        map[['boro', 'area', 'geometry']], on='area'),
                                               geometry='geometry')

    plot_rides_on_map = False
    if plot_rides_on_map:
        rides_byStation_byHour_gis = pd.merge(rides_byStation_byHour,
                                              station_profile_gis,
                                              on='station id')
        for i in range(0, 24):
            ax = map.plot(figsize=(8, 8), alpha=0.5, edgecolor='k')
            # rides_byStation_byHour_gis.plot(ax=ax, color='red', markersize=rides_byStation_byHour_gis[0])
            select_hr = str(i)
            gplt.pointplot(rides_byStation_byHour_gis[[select_hr, 'geometry']],
                           hue=select_hr,
                           scale=select_hr,
                           ax=ax,
                           legend=True,
                           legend_var='hue')
            plt.savefig(
                os.path.join(root_path,
                             r'plots/202008_station_' + select_hr + '.png'))
            # lda/pca to reduce features
            plt.close('all')

        for i in range(0, 24):
            ax = map.plot(figsize=(8, 8), alpha=0.5, edgecolor='k')

            # rides_byStation_byHour_gis.plot(ax=ax, color='red', markersize=rides_byStation_byHour_gis[0])
            select_hr = str(i)
            # gplt.pointplot(rides_byArea_byHour_gis[[select_hr, 'centroid']], hue=select_hr, scale=select_hr, ax=ax, legend=True,
            #                legend_var='hue')
            rides_byArea_byHour_gis.plot(column=select_hr, ax=ax, legend=True)

            plt.savefig(
                os.path.join(
                    root_path,
                    r'plots/202008_area_choropleth_' + select_hr + '.png'))
            plt.close('all')

    data['distance'] = abs(data['end station longitude'] -
                           data['start station longitude']) + abs(
                               data['end station latitude'] -
                               data['start station latitude'])
    data.drop(index=data[data['distance'] == 0].index, inplace=True)
    data['speed'] = data['distance'] / data['tripduration']
    # data['start_area_net_checkout'] = data[['start area','start_date_hr']].apply(
    #     lambda x: rides_byArea[((rides_byArea['area']==x.iloc[0]) & (rides_byArea['date_hour'] == x.iloc[1]))]['net_checkout'])
    start_area_checkout = rides_byArea[['area', 'date_hour', 'net_checkout']]
    start_area_checkout.columns = [
        'start area', 'start_date_hr', 'start_area_net_checkout'
    ]
    data = pd.merge(data,
                    start_area_checkout,
                    on=['start area', 'start_date_hr'],
                    how='left')
    end_area_checkout = rides_byArea[['area', 'date_hour', 'net_checkout']]
    end_area_checkout.columns = [
        'end area', 'stop_date_hr', 'end_area_net_checkout'
    ]
    data = pd.merge(data,
                    end_area_checkout,
                    on=['end area', 'stop_date_hr'],
                    how='left')

    start_station_checkout = rides_byStation[[
        'station id', 'date_hour', 'net_checkout'
    ]]
    start_station_checkout.columns = [
        'start station id', 'start_date_hr', 'start_station_net_checkout'
    ]
    data = pd.merge(data,
                    start_station_checkout,
                    on=['start station id', 'start_date_hr'],
                    how='left')

    end_station_checkout = rides_byStation[[
        'station id', 'date_hour', 'net_checkout'
    ]]
    end_station_checkout.columns = [
        'end station id', 'stop_date_hr', 'end_station_net_checkout'
    ]
    data = pd.merge(data,
                    end_station_checkout,
                    on=['end station id', 'stop_date_hr'],
                    how='left')

    feature_visualization = False
    plt.close('all')
    if feature_visualization:
        sns.distplot(data['start station latitude'])
        sns.distplot(data['start station longitude'])
        sns.distplot(data.start_area_net_checkout)
        sns.distplot(data.end_area_net_checkout)
        sns.distplot(data.start_station_net_checkout)
        sns.distplot(data.end_station_net_checkout)
        sns.distplot(data.distance)
        sns.distplot(data['distance'].apply(lambda x: math.log(x * 100)))
        sns.distplot(data.speed)

    # customer feature normalization
    data_customer_std = pd.DataFrame()
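    # cyclical (sin/cos) encoding keeps wrap-around neighbors close in feature
    # space, e.g. hour 23 vs. hour 0, or the last vs. first day of the week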
    data_customer_std['hr_x'] = data['start_hr'].apply(
        lambda hour: math.sin(2 * math.pi * hour / 24))
    data_customer_std['hr_y'] = data['start_hr'].apply(
        lambda hour: math.cos(2 * math.pi * hour / 24))
    col = 'distance'
    data_customer_std[col] = data[col].apply(lambda x: math.log(x * 100))
    # col='start_weekday'
    # data_customer_std[col]= data[col].apply(lambda x: 1 if x>=6 else 0)
    data_customer_std['weekday_x'] = data['start_weekday'].apply(
        lambda day: math.sin(2 * math.pi * day / 7))
    data_customer_std['weekday_y'] = data['start_weekday'].apply(
        lambda day: math.cos(2 * math.pi * day / 7))

    col_list = [
        'distance', 'start station latitude', 'start station longitude',
        'end station latitude', 'end station longitude',
        'start_area_net_checkout', 'end_area_net_checkout',
        'start_station_net_checkout', 'end_station_net_checkout'
    ]
    data_customer_std.loc[:, col_list] = data[col_list]
    data_customer_std.fillna(0, inplace=True)
    for col in data_customer_std.columns:
        data_customer_std[col] = data_customer_std[col] / np.std(
            data_customer_std[col])
    # sns.violinplot(data=data_customer_std,orient='h')

    dimension_reduction = False
    ## dimension reduction for visualization
    if dimension_reduction:
        # pca
        pca_plot = True
        if pca_plot:
            pca = PCA()
            data_customer_pca = pca.fit_transform(data_customer_std)
            fig = plt.figure(figsize=(12, 8))
            plt.scatter(data_customer_pca[:, 0], data_customer_pca[:, 1], s=1)
            plt.xlabel('pca feature 1')
            plt.ylabel('pca feature 2')
            plt.title('pca dimension reduction 2D')

            # pca.explained_variance_
            # pca_components=pd.DataFrame(pca.components_)
            # pca_components.columns=data_customer.columns
            # ax = pca_components.plot(kind='bar',stacked=True)
            # ax.legend(loc=1,fontsize=8)
            plt.savefig(os.path.join(root_path, r'plots/202008_pca_2D.png'))

        tsne_plot = False
        if tsne_plot:
            # t-SNE
            tsne = TSNE(random_state=42,
                        n_components=3,
                        verbose=0,
                        perplexity=40,
                        n_iter=400).fit_transform(data_customer_std)
            # 2D
            fig = plt.figure(figsize=(12, 8))
            plt.scatter(tsne[:, 0], tsne[:, 1], s=1)
            plt.xlabel('tsne feature 1')
            plt.ylabel('tsne feature 2')
            plt.title('tSNE dimension reduction 2D')
            plt.savefig(os.path.join(root_path, r'plots/202008_tsne_2D.png'))
            # 3D
            fig = plt.figure(figsize=(12, 8))
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(tsne[:, 0], tsne[:, 1], tsne[:, 2], s=1)
            ax.set_xlabel('tsne feature 1')
            ax.set_ylabel('tsne feature 2')
            ax.set_zlabel('tsne feature 3')
            plt.title('tSNE dimension reduction 3D')
            plt.savefig(os.path.join(root_path, r'plots/202008_tsne_3D.png'))
            plt.close('all')

        # umap
        umap_plot = False
        if umap_plot:
            fig, ax = plt.subplots(3, 2)
            fig.set_size_inches(10, 20)
            for i, n in enumerate([10, 50, 100]):
                embedding_corr = umap.UMAP(
                    n_neighbors=n, min_dist=0.3,
                    metric='correlation').fit_transform(data_customer_std)

                ax[i, 0].scatter(embedding_corr[:, 0],
                                 embedding_corr[:, 1],
                                 edgecolor='none',
                                 alpha=0.80,
                                 s=10)
                ax[i, 0].set_xlabel('umap feature 1')
                ax[i, 0].set_ylabel('umap feature 2')
                ax[i, 0].set_title(
                    f'umap dimension reduction_corr metrics_{n}_neighbors')

                embedding_dist = umap.UMAP(
                    n_neighbors=n, min_dist=0.3,
                    metric='euclidean').fit_transform(data_customer_std)

                ax[i, 1].scatter(embedding_dist[:, 0],
                                 embedding_dist[:, 1],
                                 edgecolor='none',
                                 alpha=0.80,
                                 s=10)
                ax[i, 1].set_xlabel('umap feature 1')
                ax[i, 1].set_ylabel('umap feature 2')
                ax[i, 1].set_title(
                    f'umap dimension reduction_euclidean metrics_{n}_neighbors'
                )

            plt.suptitle('umap visualization')
            plt.savefig(
                os.path.join(root_path,
                             r'plots/202008_umap_visualization.png'))
            plt.close('all')

    clustering = True
    if clustering:
        ## clustering
        # k-means
        data_customer_std_sample = copy.deepcopy(data_customer_std.loc[::1, :])
        num_max = 4
        clustering_kmeans = True
        if clustering_kmeans:
            start_time = time.process_time()
            kmeans_labels_agg = {}
            sil_scores_kmeans_agg = {}
            ch_scores_kmeans_agg = {}
            for num in range(2, num_max + 1):
                kmeans = KMeans(n_clusters=num, random_state=0)
                kmeans_labels_agg[num] = kmeans.fit_predict(
                    data_customer_std_sample)
                sil_scores_kmeans_agg[num] = metrics.silhouette_score(
                    data_customer_std_sample, kmeans_labels_agg[num])
                ch_scores_kmeans_agg[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, kmeans_labels_agg[num])
            # pd.DataFrame.from_dict(sil_scores_kmeans_agg.values()).plot()

        clustering_hierachy = True
        if clustering_hierachy:
            start_time = time.process_time()
            ward_labels_agg = {}
            sil_scores_ward_agg = {}
            ch_scores_ward_agg = {}
            for num in range(2, num_max + 1):
                ward_clustering = AgglomerativeClustering(
                    n_clusters=num,
                    linkage='ward').fit(data_customer_std_sample)
                ward_labels_agg[num] = ward_clustering.labels_

                sil_scores_ward_agg[num] = metrics.silhouette_score(
                    data_customer_std_sample, ward_labels_agg[num])
                ch_scores_ward_agg[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, ward_labels_agg[num])
                print(
                    f'ward clustering takes time {time.process_time() - start_time}'
                )
            # pd.DataFrame.from_dict(sil_scores_ward_agg.values()).plot()

        clustering_gmm = True
        if clustering_gmm:
            start_time = time.process_time()
            gmm_labels_agg = {}
            sil_scores_gmm_agg = {}
            ch_scores_gmm_agg = {}
            for num in range(2, num_max + 1):
                gmm_clustering = GaussianMixture(
                    n_components=num).fit(data_customer_std_sample)
                gmm_labels_agg[num] = gmm_clustering.predict(
                    data_customer_std_sample)
                sil_scores_gmm_agg[num] = metrics.silhouette_score(
                    data_customer_std_sample, gmm_labels_agg[num])
                ch_scores_gmm_agg[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, gmm_labels_agg[num])
                print(
                    f'gmm clustering takes time {time.process_time() - start_time}'
                )

        umap_clustering = True
        if umap_clustering:
            embedding_corr = umap.UMAP(
                n_neighbors=10, min_dist=0.3,
                metric='correlation').fit_transform(data_customer_std_sample)

            start_time = time.process_time()
            kmeans_labels_umap = {}
            sil_scores_kmeans_umap = {}
            ch_scores_kmeans_umap = {}
            for num in range(2, num_max + 1):
                kmeans = KMeans(n_clusters=num, random_state=0)
                kmeans_labels_umap[num] = kmeans.fit_predict(embedding_corr)
                sil_scores_kmeans_umap[num] = metrics.silhouette_score(
                    data_customer_std_sample, kmeans_labels_umap[num])
                ch_scores_kmeans_umap[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, kmeans_labels_umap[num])

            start_time = time.process_time()
            ward_labels_umap = {}
            sil_scores_ward_umap = {}
            ch_scores_ward_umap = {}
            for num in range(2, num_max + 1):
                ward_clustering = AgglomerativeClustering(
                    n_clusters=num, linkage='ward').fit(embedding_corr)
                ward_labels_umap[num] = ward_clustering.labels_
                sil_scores_ward_umap[num] = metrics.silhouette_score(
                    data_customer_std_sample, ward_labels_umap[num])
                ch_scores_ward_umap[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, ward_labels_umap[num])
                print(
                    f'ward clustering takes time {time.process_time() - start_time}'
                )

            start_time = time.process_time()
            gmm_labels_umap = {}
            sil_scores_gmm_umap = {}
            ch_scores_gmm_umap = {}
            for num in range(2, num_max + 1):
                gmm_clustering = GaussianMixture(
                    n_components=num).fit(embedding_corr)  # was hard-coded 3; use the loop variable
                gmm_labels_umap[num] = gmm_clustering.predict(embedding_corr)
                sil_scores_gmm_umap[num] = metrics.silhouette_score(
                    data_customer_std_sample, gmm_labels_umap[num])
                ch_scores_gmm_umap[num] = metrics.calinski_harabasz_score(
                    data_customer_std_sample, gmm_labels_umap[num])
                print(
                    f'gmm clustering takes time {time.process_time() - start_time}'
                )

        plot_hierachy_linkage = False
        if plot_hierachy_linkage:
            ward_clustering_full = AgglomerativeClustering(
                distance_threshold=0,
                n_clusters=None).fit(data_customer_std_sample)
            linkage = hierarchy.linkage(ward_clustering_full.children_, 'ward')
            plt.figure(figsize=(10, 7))
            dn = hierarchy.dendrogram(linkage)
            # plot_dendrogram(ward_clustering_full,truncate_mode='level', p=3)

        plot_clustering_2D = False
        if plot_clustering_2D:
            embedding_corr = umap.UMAP(
                n_neighbors=10, min_dist=0.3,
                metric='correlation').fit_transform(data_customer_std_sample)

            # labels=ward_labels_agg[4]
            labels = ward_labels_umap[2]
            # visualize clustering
            fig = plt.figure(figsize=(12, 8))
            plt.scatter(embedding_corr[:, 0],
                        embedding_corr[:, 1],
                        edgecolor='none',
                        alpha=0.80,
                        s=10,
                        c=labels)
            plt.xlabel('umap feature 1')
            plt.ylabel('umap feature 2')
            # plt.title(f'umap visualization with kmeans clustering labelling')
            # plt.savefig(os.path.join(root_path,r'plots/202008_umap_visualization_kmeans_clustering.png'))

            plt.title(
                f'umap visualization with ward hierachy clustering labelling')
            plt.savefig(
                os.path.join(
                    root_path,
                    r'plots/202008_umap_visualization_ward_clustering.png'))

        plot_clustering_feature_detail = True
        if plot_clustering_feature_detail:
            # analyze feature importance
            labels_dict = {
                0: kmeans_labels_agg,
                1: ward_labels_agg,
                2: gmm_labels_agg,
                3: kmeans_labels_umap,
                4: ward_labels_umap,
                5: gmm_labels_umap
            }
            labels_str_dict = {
                0: 'kmeans',
                1: 'ward',
                2: 'gmm',
                3: 'kmeans_umap',
                4: 'ward_umap',
                5: 'gmm_umap'
            }

            for type in range(0, 3):
                for cluster_num in range(2, num_max + 1):
                    # cluster_num=4
                    col_select = [
                        'start station longitude', 'start station latitude',
                        'end station latitude', 'end station longitude',
                        'start_hr', 'start_weekday', 'start_area_net_checkout',
                        'end_area_net_checkout', 'start_station_net_checkout',
                        'end_station_net_checkout'
                    ]
                    # fig, ax = plt.subplots(len(col_select), cluster_num)
                    # fig.set_size_inches(5 * cluster_num, 20)
                    # plt.suptitle('clustering feature analysis')
                    # plt.tight_layout()
                    #
                    # labels = labels_dict[type][cluster_num]
                    # df_customer_cluster = {}
                    #
                    #
                    # for cluster_i in range(0, cluster_num):
                    #     print(f'analyze cluster {cluster_i}')
                    #     mask_i = np.argwhere(labels == cluster_i).ravel()
                    #     mask_i_original = data_customer_std_sample.iloc[mask_i].index
                    #     df_customer_cluster[cluster_i] = data.loc[mask_i_original].copy()
                    #     for i, col in enumerate(col_select):
                    #         ax[i, cluster_i] = sns.histplot(ax=ax[i, cluster_i],
                    #                                         data=df_customer_cluster[cluster_i][col], kde=True)
                    #
                    # plt.savefig(os.path.join(root_path, r'plots',
                    #                          f'202008_clustering feature analysis_{labels_str_dict[type]}_{cluster_num}.png'))

                    labels = labels_dict[type][cluster_num]
                    df_customer_cluster = {}

                    gs_kw = dict(height_ratios=[1.5, 4, 4, 2, 2, 1.5])
                    fig, ax = plt.subplots(6,
                                           cluster_num,
                                           constrained_layout=True,
                                           gridspec_kw=gs_kw,
                                           figsize=(8 * cluster_num, 30))

                    #plt.tight_layout(pad=8)
                    for cluster_i in range(0, cluster_num):
                        print(
                            f'analyze cluster {cluster_i} of {labels_str_dict[type]}'
                        )
                        mask_i = np.argwhere(labels == cluster_i).ravel()
                        mask_i_original = data_customer_std_sample.iloc[
                            mask_i].index
                        df_customer_cluster[cluster_i] = copy.deepcopy(
                            data.loc[mask_i_original])
                        df_customer_cluster[cluster_i][
                            'start area'] = df_customer_cluster[cluster_i][
                                'start area'].apply(lambda x: x.split('-')[0])
                        df_customer_cluster[cluster_i][
                            'end area'] = df_customer_cluster[cluster_i][
                                'end area'].apply(lambda x: x.split('-')[0])

                        #bar plot starting area

                        ###question: appearance of the rides in general: by boro
                        df1 = pd.DataFrame(
                            (df_customer_cluster[cluster_i]
                             ['start boro'].value_counts())).sort_values(
                                 by='start boro', ascending=False)

                        df2 = pd.DataFrame(
                            (df_customer_cluster[cluster_i]
                             ['end boro'].value_counts())).sort_values(
                                 by='end boro', ascending=False)

                        df_boro = pd.merge(df1,
                                           df2,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
                        df_boro.plot(kind='bar', ax=ax[0, cluster_i])
                        ax[0, cluster_i].title.set_text(
                            f'Rides occurrence by borough in cluster {cluster_i+1}'
                        )

                        ##appearance of the rides by area
                        df_weekday = df_customer_cluster[cluster_i][
                            df_customer_cluster[cluster_i]
                            ['start_weekday'].isin([1, 2, 3, 4, 5])]
                        df1 = pd.DataFrame(
                            (df_weekday['start area'].value_counts()
                             )).sort_values(by='start area', ascending=False)
                        df2 = pd.DataFrame(
                            (df_weekday['end area'].value_counts()
                             )).sort_values(by='end area', ascending=False)
                        df_area = pd.merge(df1,
                                           df2,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
                        df_area['start area'].fillna(0, inplace=True)
                        df_area['end area'].fillna(0, inplace=True)
                        df_area.sort_values('start area',
                                            ascending=True,
                                            inplace=True)

                        df_area.plot(kind='barh', ax=ax[1, cluster_i])
                        ax[1, cluster_i].tick_params(labelsize=8)
                        ax[1, cluster_i].title.set_text(
                            f'Rides occurrence by area (weekday) in cluster {cluster_i+1}'
                        )

                        #weekend
                        df_weekend = df_customer_cluster[cluster_i][
                            df_customer_cluster[cluster_i]
                            ['start_weekday'].isin([6, 7])]
                        df1 = pd.DataFrame(
                            (df_weekend['start area'].value_counts()
                             )).sort_values(by='start area', ascending=False)
                        df2 = pd.DataFrame(
                            (df_weekend['end area'].value_counts()
                             )).sort_values(by='end area', ascending=False)
                        df_area = pd.merge(df1,
                                           df2,
                                           how='outer',
                                           left_index=True,
                                           right_index=True)
                        df_area['start area'].fillna(0, inplace=True)
                        df_area['end area'].fillna(0, inplace=True)
                        df_area.sort_values('start area',
                                            ascending=True,
                                            inplace=True)
                        df_area.plot(kind='barh', ax=ax[2, cluster_i])
                        ax[2, cluster_i].tick_params(labelsize=8)
                        ax[2, cluster_i].title.set_text(
                            f'Rides occurrence by area (weekend) in cluster {cluster_i+1}'
                        )

                        ##appearance of the rides by hour
                        df_start_time_raw = df_customer_cluster[cluster_i][[
                            'start_weekday', 'start_hr'
                        ]].groupby(['start_weekday', 'start_hr']).size()
                        df_start_time = df_start_time_raw.reset_index()
                        df_start_time.columns = [
                            'ride_day', 'ride_hr', 'count'
                        ]
                        #weekay rides
                        df_start_time_weekday = df_start_time.loc[
                            df_start_time['ride_day'].isin([1, 2, 3, 4, 5])]
                        sns.barplot(data=df_start_time_weekday,
                                    x="ride_hr",
                                    y='count',
                                    hue='ride_day',
                                    palette='husl',
                                    ax=ax[3, cluster_i])
                        ax[3, cluster_i].title.set_text(
                            f'Rides occurrence by hour (weekday) in cluster {cluster_i+1}'
                        )
                        #weekend rides
                        df_start_time_weekend = df_start_time.loc[
                            df_start_time['ride_day'].isin([6, 7])]
                        sns.barplot(data=df_start_time_weekend,
                                    x="ride_hr",
                                    y='count',
                                    hue='ride_day',
                                    palette='husl',
                                    ax=ax[4, cluster_i])
                        ax[4, cluster_i].title.set_text(
                            f'Rides occurrence by hour (weekend) in cluster {cluster_i+1}'
                        )

                        df_customer_cluster[cluster_i].usertype.value_counts(
                        ).plot(kind='bar', ax=ax[5, cluster_i])
                        ax[5, cluster_i].title.set_text(
                            f'Rides occurrence by usertype in cluster {cluster_i + 1}'
                        )
                        # ax=sns.histplot(df_start_time, x="ride_hr",binwidth=1, y='count', hue='ride_day',palette="pastel",unstack)
                        # sns.histplot(df_start_time, x="ride_hr",binwidth=1, y='count',hue='ride_day',ax=ax[2, cluster_i])
                        # ax[2, cluster_i].title.set_text(f'Day/Time of the rides in cluster {cluster_i+1}')
                        #
                        # #bar plot start and end area demand comparison
                        # df_val = df_customer_cluster[cluster_i][
                        #     ['start_area_net_checkout', 'end_area_net_checkout']].groupby(
                        #     ['start_area_net_checkout', 'end_area_net_checkout']).size()
                        # df_checkout = df_customer_cluster[cluster_i][
                        #     ['start_area_net_checkout', 'end_area_net_checkout']].copy()
                        # df_checkout.dropna(inplace=True)
                        # df_checkout['val'] = df_checkout.apply(lambda x: df_val[x.iloc[0]][x.iloc[1]], axis=1)
                        # df_checkout.plot.scatter(
                        #     x='start_area_net_checkout', y='end_area_net_checkout', s='val', ax=ax[6, cluster_i])
                        # ax[6, cluster_i].title.set_text(f'Net Checkouts Comparison of Start & End Area in {cluster_i+1}')

                        # plt.setp(ax[0, cluster_i].yaxis.get_majorticklabels(), fontsize=8)
                        # plt.setp(ax[1, cluster_i].yaxis.get_majorticklabels(), fontsize=8)
                        # plt.setp(ax[2, cluster_i].yaxis.get_majorticklabels(), fontsize=8)
                    plt.savefig(
                        os.path.join(
                            root_path, r'plots',
                            f'202008_{labels_str_dict[type]}_{cluster_num}_cluster_feature_detail.png'
                        ))

                    plt.close('all')

            for type in range(0, 3):
                for cluster_num in range(2, num_max + 1):
                    # geoplot
                    # type = 2
                    # cluster_num = 3
                    labels = labels_dict[type][cluster_num]
                    df_customer_cluster = {}
                    fig, ax = plt.subplots(cluster_num, 2)
                    fig.set_size_inches(15, 7 * cluster_num)
                    plt.tight_layout(pad=5)

                    for cluster_i in range(0, cluster_num):
                        print(f'analyze cluster {cluster_i}')
                        mask_i = np.argwhere(labels == cluster_i).ravel()
                        mask_i_original = data_customer_std_sample.iloc[
                            mask_i].index
                        df_customer_cluster[cluster_i] = data.loc[
                            mask_i_original].copy()
                        # df_customer_cluster[cluster_i] = pd.merge(df_customer_cluster[cluster_i],
                        #                                           area_concat('start', station_profile_gis[
                        #                                               ['station id', 'geometry']]),
                        #                                           how='left', on='start station id')
                        # df_customer_cluster[cluster_i] = pd.merge(df_customer_cluster[cluster_i],
                        #                                           area_concat('end', station_profile_gis[
                        #                                               ['station id', 'geometry']]),
                        #                                           how='left', on='end station id')

                        #df_customer_cluster[cluster_i]['weekend_flag']=df_customer_cluster[cluster_i]['start_weekday'].apply(lambda x: 1 if x>=6 else 0)

                        # counter_start = df_customer_cluster[cluster_i]['start area'].value_counts()
                        # counter_end = df_customer_cluster[cluster_i]['end area'].value_counts()
                        # df_customer_cluster[cluster_i]['start_area_net_checkout_median'] = df_customer_cluster[
                        #     cluster_i]['start area'].apply(lambda x: counter_start[x])
                        # df_customer_cluster[cluster_i]['end_area_net_checkout_median'] = df_customer_cluster[
                        #     cluster_i]['end area'].apply(lambda x: counter_end[x])

                        start_avg = df_customer_cluster[cluster_i][[
                            'start area', 'start_area_net_checkout'
                        ]].groupby(['start area']).mean()
                        start_avg_gis = gpd.GeoDataFrame(
                            pd.merge(start_avg.reset_index(),
                                     map[['boro', 'area', 'geometry']],
                                     how='inner',
                                     left_on='start area',
                                     right_on='area'),
                            geometry='geometry',
                            crs='EPSG:4326')
                        map.plot(ax=ax[cluster_i, 0],
                                 figsize=(8, 8),
                                 alpha=0.5,
                                 edgecolor='k')
                        start_avg_gis.plot(column='start_area_net_checkout',
                                           ax=ax[cluster_i, 0],
                                           legend=True)

                        end_avg = df_customer_cluster[cluster_i][[
                            'end area', 'end_area_net_checkout'
                        ]].groupby(['end area']).mean()
                        end_avg_gis = gpd.GeoDataFrame(pd.merge(
                            end_avg.reset_index(),
                            map[['boro', 'area', 'geometry']],
                            how='inner',
                            left_on='end area',
                            right_on='area'),
                                                       geometry='geometry',
                                                       crs='EPSG:4326')

                        map.plot(ax=ax[cluster_i, 1],
                                 figsize=(8, 8),
                                 alpha=0.5,
                                 edgecolor='k')
                        end_avg_gis.plot(column='end_area_net_checkout',
                                         ax=ax[cluster_i, 1],
                                         legend=True)

                        ax[cluster_i, 0].title.set_text(
                            f'Start Station Net Checkouts in cluster {cluster_i + 1}'
                        )
                        ax[cluster_i, 1].title.set_text(
                            f'End Station Net Checkouts in cluster {cluster_i + 1}'
                        )
                    plt.savefig(
                        os.path.join(
                            root_path, r'plots',
                            f'202008_{labels_str_dict[type]}_{cluster_num}_station_detail.png'
                        ))

                    plt.close('all')

    run_classification = True
    if run_classification:
        y = gmm_labels_agg[3]

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(
            data_customer_std_sample, y, test_size=0.2)

        from sklearn.naive_bayes import GaussianNB
        from sklearn.metrics import accuracy_score
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_predict = gnb.predict(x_test)
        y_combo = list(zip(y_test, y_predict))
        print(f'GaussianNB accuracy: {accuracy_score(y_test, y_predict):.3f}')
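
        # A hedged follow-up sketch (not part of the original snippet): a
        # single accuracy number hides which clusters get confused with
        # which, so a confusion matrix is worth printing alongside it.
        from sklearn.metrics import confusion_matrix
        # Rows are true cluster labels, columns are predicted labels.
        print(confusion_matrix(y_test, y_predict))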
Example #20
f, ax = plt.subplots(1)
# Plot polygons in light grey
gpd.plotting.plot_polygon_collection(ax,
                                     bgm['geometry'],
                                     facecolor='grey',
                                     alpha=0.25,
                                     linewidth=0.1)

gpd.plotting.plot_polygon_collection(ax,
                                     gdfpol['geometry'],
                                     facecolor='none',
                                     edgecolor='green',
                                     linewidth=0.1)

f  # display the current figure (notebook idiom)

geoplot.polyplot(bgm, ax=ax)

ax = geoplot.kdeplot(gdfpts_sub,
                     shade=True,
                     shade_lowest=False,
                     cmap="coolwarm",
                     clip=bgm.geometry)
geoplot.polyplot(bgm, ax=ax)
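
# Hedged variant for newer seaborn/geoplot releases, where the deprecated
# shade/shade_lowest keywords used above were replaced by fill/thresh:
ax = geoplot.kdeplot(gdfpts_sub,
                     fill=True,
                     thresh=0.05,
                     cmap="coolwarm",
                     clip=bgm.geometry)
geoplot.polyplot(bgm, ax=ax)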

ax = geoplot.pointplot(gdfpts_sub)
geoplot.polyplot(bgm, ax=ax)

# Optionally write the titles out to a CSV file
df['properties.title'].to_csv('NLD_S2-L2_st31_190626.csv', index=False)
Example #21
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

nyc_boroughs = gpd.read_file(gplt.datasets.get_path('nyc_boroughs'))
nyc_fatal_collisions = gpd.read_file(
    gplt.datasets.get_path('nyc_fatal_collisions'))
nyc_injurious_collisions = gpd.read_file(
    gplt.datasets.get_path('nyc_injurious_collisions'))

fig = plt.figure(figsize=(10, 5))
proj = gcrs.AlbersEqualArea(central_latitude=40.7128,
                            central_longitude=-74.0059)
ax1 = plt.subplot(121, projection=proj)
ax2 = plt.subplot(122, projection=proj)

gplt.polyplot(nyc_boroughs, ax=ax1, projection=proj)
gplt.pointplot(nyc_fatal_collisions,
               projection=proj,
               hue='BOROUGH',
               cmap='Set1',
               edgecolor='white',
               linewidth=0.5,
               scale='NUMBER OF PERSONS KILLED',
               limits=(2, 8),
               legend=True,
               legend_var='scale',
               legend_kwargs={'loc': 'upper left'},
               legend_values=[2, 1],
               legend_labels=['2 Fatalities', '1 Fatality'],
               ax=ax1)
ax1.set_title("Fatal Crashes in New York City, 2016")

gplt.polyplot(nyc_boroughs, ax=ax2, projection=proj)
gplt.pointplot(nyc_injurious_collisions,
               projection=proj,
               hue='BOROUGH',
               cmap='Set1',
               edgecolor='white',
               linewidth=0.5,
               scale='NUMBER OF PERSONS INJURED',
               limits=(1, 10),
               legend=True,
               legend_var='scale',
               legend_kwargs={'loc': 'upper left'},
               legend_values=[20, 15, 10, 5, 1],
               legend_labels=['20 Injuries', '15 Injuries', '10 Injuries',
                              '5 Injuries', '1 Injury'],
               ax=ax2)
ax2.set_title("Injurious Crashes in New York City, 2016")
Example #22
import geopandas as gpd
import geoplot
import matplotlib.pyplot as plt
import contextily as ctx

df_quakes = gpd.read_file("lastday.json")
df_quakes = df_quakes[df_quakes["mag"] != "-"]
df_quakes["mag_num"] = df_quakes["mag"].astype(float)
df_quakes = df_quakes[df_quakes.mag_num > 0]

extent = (950000, 2000000, 5800000, 6300000)
df_quakes = df_quakes.to_crs(epsg=3857)
ax = geoplot.pointplot(df_quakes,
                       color="red",
                       scale="mag_num",
                       limits=(0.5, 1.5))
ax.axis(extent)
ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerLite, zoom=6)
plt.show()

#source=ctx.providers.BasemapAT.grau
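
# Hedged alternative for current contextily installs: the Stamen tile
# endpoints used above have since been retired, so a provider that still
# resolves (OpenStreetMap.Mapnik here) is the safer choice:
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=6)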
Example #23
"""
KDEPlot of Boston AirBnB Locations
==================================

This example demonstrates a combined application of ``kdeplot`` and ``pointplot`` to a
dataset of AirBnB locations in Boston. The result is output to a webmap using the nifty
``mplleaflet`` library. We sample just 1000 points, which captures the overall trend without
overwhelming the renderer.

`Click here to see this plot as an interactive webmap. 
<http://bl.ocks.org/ResidentMario/868ac097d671df1ed5ec83eed048560c>`_
"""

import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
import mplleaflet

boston_airbnb_listings = gpd.read_file(
    gplt.datasets.get_path('boston_airbnb_listings'))

ax = gplt.kdeplot(boston_airbnb_listings, cmap='Greens')
gplt.pointplot(boston_airbnb_listings.sample(1000), color='darkgreen', ax=ax)
fig = plt.gcf()
plt.savefig("boston-airbnb-kde.png", bbox_inches='tight', pad_inches=0.1)

# mplleaflet.show(fig)
Example #24
"""
Many applications involve working with geographical data. For example, when
tracking global weather, we might want to plot the temperature as measured by
various sensors around the world at their position on a map. For this, we can
use the GeoPandas package and the Geoplot package, both of which allow us to
manipulate, analyze, and visualize geographical data.

This module illustrates how to use GeoPandas and Geoplot packages to load and
visualize some sample geographical data.
"""
import geoplot
import geopandas
import matplotlib.pyplot as plt

world = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))

cities = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_cities"))

fig, ax = plt.subplots()
geoplot.polyplot(world, ax=ax)
geoplot.pointplot(cities, ax=ax, fc="r", marker="2")
ax.axis((-180, 180, -90, 90))

plt.show()
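
# Hedged equivalent for GeoPandas >= 1.0, where geopandas.datasets was
# removed: read the same layers from local Natural Earth downloads. The file
# names below are the standard Natural Earth ones; the paths are placeholders
# and should point at wherever the shapefiles were unpacked.
world = geopandas.read_file("ne_110m_admin_0_countries.shp")
cities = geopandas.read_file("ne_110m_populated_places.shp")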
Example #25
import geopandas as gpd
from quilt.data.ResidentMario import geoplot_data

continental_cities = gpd.read_file(geoplot_data.usa_cities()).query('POP_2010 > 100000')
continental_usa = gpd.read_file(geoplot_data.contiguous_usa())


# Plot the figure.
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

poly_kwargs = {'linewidth': 0.5, 'edgecolor': 'gray', 'zorder': -1}
point_kwargs = {'linewidth': 0.5, 'edgecolor': 'black', 'alpha': 1}
legend_kwargs = {'bbox_to_anchor': (0.9, 0.9), 'frameon': False}

ax = gplt.polyplot(continental_usa,
                   projection=gcrs.AlbersEqualArea(central_longitude=-98, central_latitude=39.5),
                   **poly_kwargs)

gplt.pointplot(continental_cities, projection=gcrs.AlbersEqualArea(), ax=ax,
               scale='POP_2010', limits=(1, 80),
               hue='POP_2010', cmap='Blues',
               legend=True, legend_var='scale',
               legend_values=[8000000, 6000000, 4000000, 2000000, 100000],
               legend_labels=['8 million', '6 million', '4 million', '2 million', '100 thousand'],
               legend_kwargs=legend_kwargs,
               **point_kwargs)

plt.title("Large cities in the contiguous United States, 2010")
plt.savefig("largest-cities-usa.png", bbox_inches='tight', pad_inches=0.1)
Example #26
    def test_pointplot(self):
        try:
            gplt.pointplot(series_gaussian_points, k=2)
            gplt.pointplot(dataframe_gaussian_points, k=2)

            gplt.pointplot(dataframe_gaussian_points, hue=list_hue_values, k=None)
            gplt.pointplot(dataframe_gaussian_points, hue=series_hue_values, k=None)
            gplt.pointplot(dataframe_gaussian_points, hue=map_hue_values(), k=None)
            gplt.pointplot(dataframe_gaussian_points, hue='hue_var', k=None)
        finally:
            plt.close('all')
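
# Hedged update for later geoplot releases, which dropped the `k` keyword in
# favor of an explicit mapclassify scheme (the pattern Example #28 below
# uses); a sketch of one of the calls above under that newer API:
import mapclassify as mc
scheme = mc.Quantiles(dataframe_gaussian_points['hue_var'], k=2)
gplt.pointplot(dataframe_gaussian_points, hue='hue_var', scheme=scheme)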
Example #27
"""
KDEPlot of Boston AirBnB Locations
==================================

This example demonstrates a combined application of ``kdeplot`` and ``pointplot`` to a
dataset of AirBnB locations in Boston. Unlike Example #23 above, the result is drawn
over a live web-tile basemap with ``webmap`` rather than exported through
``mplleaflet``.
"""

import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

boston_airbnb_listings = gpd.read_file(
    gplt.datasets.get_path('boston_airbnb_listings'))

ax = gplt.kdeplot(boston_airbnb_listings,
                  cmap='viridis',
                  projection=gcrs.WebMercator(),
                  figsize=(12, 12),
                  shade=True)
gplt.pointplot(boston_airbnb_listings, s=1, color='black', ax=ax)
gplt.webmap(boston_airbnb_listings, ax=ax)
plt.title('Boston AirBnB Locations, 2016', fontsize=18)
Example #28
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import matplotlib.pyplot as plt

contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))
continental_usa_cities = gpd.read_file(
    gplt.datasets.get_path('usa_cities')).query('STATE not in ["AK", "HI", "PR"]')
scheme = mc.Quantiles(continental_usa_cities['POP_2010'], k=5)

ax = gplt.polyplot(
    contiguous_usa,
    zorder=-1,
    linewidth=1,
    projection=gcrs.AlbersEqualArea(),
    edgecolor='white',
    facecolor='lightgray',
    figsize=(8, 12)
)
gplt.pointplot(
    continental_usa_cities,
    scale='POP_2010',
    limits=(2, 30),
    hue='POP_2010',
    cmap='Blues',
    scheme=scheme,
    legend=True,
    legend_var='scale',
    legend_values=[8000000, 2000000, 1000000, 100000],
    legend_labels=['8 million', '2 million', '1 million', '100 thousand'],
    legend_kwargs={'frameon': False, 'loc': 'lower right'},
    ax=ax
)


plt.title("Large cities in the contiguous United States, 2010")
plt.savefig("largest-cities-usa.png", bbox_inches='tight', pad_inches=0.1)
plt.show()
Example #29
"""
This script tests whether the current environment works correctly or not.
"""

import sys
sys.path.insert(0, '../')
import geoplot as gplt
from geoplot import crs as gcrs
import geopandas as gpd

# cf. https://github.com/Toblerity/Shapely/issues/435

# Fiona/Shapely/Geopandas test.
cities = gpd.read_file("../data/cities/citiesx010g.shp")
census_tracts = gpd.read_file(
    "../data/nyc_census_tracts/census_tracts_2010.geojson")

# Cartopy test.
gplt.pointplot(cities.head(50), extent=(10, 20, 10, 20))
Example #30
import geopandas as gpd
from quilt.data.ResidentMario import geoplot_data

boroughs = gpd.read_file(geoplot_data.nyc_boroughs())
fatal_collisions = gpd.read_file(geoplot_data.nyc_fatal_collisions())
injurious_collisions = gpd.read_file(geoplot_data.nyc_injurious_collisions())


# Plot the data.
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10,5))

ax1 = plt.subplot(121, projection=gcrs.AlbersEqualArea(central_latitude=40.7128, central_longitude=-74.0059))
gplt.polyplot(boroughs, ax=ax1, projection=gcrs.AlbersEqualArea())
gplt.pointplot(fatal_collisions, projection=gcrs.AlbersEqualArea(),
               hue='BOROUGH', categorical=True,
               edgecolor='white', linewidth=0.5, zorder=10,
               scale='NUMBER OF PERSONS KILLED', limits=(2, 8),
               legend=True, legend_var='scale', legend_kwargs={'loc': 'upper left'},
               legend_values=[2, 1], legend_labels=['2 Fatalities', '1 Fatality'],
               ax=ax1)
plt.title("Fatal Crashes in New York City, 2016")

ax2 = plt.subplot(122, projection=gcrs.AlbersEqualArea(central_latitude=40.7128, central_longitude=-74.0059))
gplt.polyplot(boroughs, ax=ax2, projection=gcrs.AlbersEqualArea())
gplt.pointplot(injurious_collisions, projection=gcrs.AlbersEqualArea(),
               hue='BOROUGH', categorical=True,
               edgecolor='white', linewidth=0.5, zorder=10,
               scale='NUMBER OF PERSONS INJURED', limits=(1, 10),
               legend=True, legend_var='scale', legend_kwargs={'loc': 'upper left'},
               legend_values=[20, 15, 10, 5, 1],
               legend_labels=['20 Injuries', '15 Injuries', '10 Injuries', '5 Injuries', '1 Injury'],
               ax=ax2)
plt.title("Injurious Crashes in New York City, 2016")
Example #31
                                 sep=",",
                                 header=0)
HUC8 = geopandas.GeoDataFrame.from_file('watershed/huc8sum.shp')
HUC8.crs = 'EPSG:4326'

#Ecoregions
Ecoregions = geopandas.GeoDataFrame.from_file(
    'NARS_NP_values/narswsa_20110504.shp')
Ecoregions.crs = 'EPSG:4326'

#NARS points
proj = gcrs.AlbersEqualArea()
ax = gplt.polyplot(HUC8, projection=proj)
gplt.pointplot(gdf_NARS_site_info_UIDindex,
               ax=ax,
               projection=proj,
               s=1,
               color='red')
plt.savefig('NARS_locations.pdf')
plt.savefig("NARS_locations.png", bbox_inches='tight', pad_inches=0.1)

#gdf_NARS_site_info_UIDindex_renamed = gdf_NARS_site_info_UIDindex.rename(columns={'HUC8':'HUC_8'})

# =============================================================================
# NARS (rivers and streams)
# =============================================================================

# UID_HUC8_dict
UID_HUC8_dict = {
    key: NARS_site_info['HUC8'][NARS_site_info.UID == key].values
    for key in NARS_site_info['UID'].values
}
Example #32
y_pred = dt.predict(lakeData5)
classification = pd.DataFrame(data=y_pred, columns=['intermittent'])
classifiedData = glakeData.copy()
classifiedData['intermittent'] = classification['intermittent'].values
intermittentLakes5 = classifiedData[(classifiedData['intermittent'] == 1)]
print(intermittentLakes5.shape)

#Plot current intermittent and annual lakes
ax = gplt.polyplot(world,
                   projection=gplt.crs.NorthPolarStereo(),
                   facecolor='whitesmoke',
                   figsize=(15, 15))

gplt.pointplot(annualLakes,
               color='black',
               ax=ax,
               s=0.5,
               label='Annual winter ice')
gplt.pointplot(intermittentLakes,
               color='tab:orange',
               ax=ax,
               s=0.5,
               label='Intermittent winter ice')
lgnd = plt.legend(loc="lower left", scatterpoints=1, fontsize=18)
# Matplotlib >= 3.7 renamed Legend.legendHandles to legend_handles; support both.
for handle in getattr(lgnd, 'legend_handles', None) or lgnd.legendHandles:
    handle._sizes = [100]
plt.savefig('currentLakeMapSharmaMinGood.png', bbox_inches='tight')
plt.clf()

# Plot warming scenarios
Example #33
gplt.choropleth(data_2020,
                hue='avg_d_kbps',
                projection=gcrs.AlbersEqualArea(),
                cmap='Greens',
                legend=True)
plt.show()

# In[43]:

# use the location of the centroid of each polygon
data_2020['geometry'] = data_2020['geometry'].centroid
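
# Hedged refinement: taking centroids in a geographic CRS is slightly off and
# draws a warning on newer GeoPandas; projecting first gives true centroids.
# Assumes data_2020 carries a CRS; round-tripped through Web Mercator here.
data_2020['geometry'] = (
    data_2020['geometry'].to_crs(epsg=3857).centroid.to_crs(epsg=4326))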

# In[56]:

ax = gplt.webmap(data_2020, projection=gcrs.WebMercator())
gplt.pointplot(data_2020, ax=ax, hue='avg_d_kbps', legend=True)
plt.show()

# In[53]:

ax = gplt.webmap(data_2020, projection=gcrs.WebMercator())
gplt.kdeplot(data_2020[['avg_d_kbps', 'geometry']],
             n_levels=50,
             cmap='Reds',
             thresh=0.05,
             shade=True,
             ax=ax)
plt.show()

# In[32]:
Example #34
polyplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(),
    'facecolor': (0.9, 0.9, 0.9),
    'zorder': -100,
    'linewidth': 0
}
pointplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(),
    'scale': 'ELEV_IN_FT',
    'edgecolor': 'white',
    'linewidth': 0.5,
    'color': 'black'
}
ylim = (-1647757.3894385984, 1457718.4893930717)

# Our first plot is a default linear-scale one. We can see from the results that this is clearly the most appropriate
# one for this specific data.
gplt.polyplot(contiguous_usa.geometry, ax=axarr[0][0], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"),
               ax=axarr[0][0],
               limits=(0.1, 10),
               **pointplot_kwargs)
axarr[0][0].set_title("Linear Scale")
axarr[0][0].set_ylim(ylim)


# Next, a trivial identity scale. This results in a plot where every city has the same size.
def identity_scale(minval, maxval):
    def scalar(val):
        return 2

    return scalar


gplt.polyplot(contiguous_usa.geometry, ax=axarr[0][1], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"),
               ax=axarr[0][1],
               scale_func=identity_scale,
               **pointplot_kwargs)
axarr[0][1].set_title("Identity Scale")
axarr[0][1].set_ylim(ylim)
Example #35
polyplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(), 'facecolor': (0.9, 0.9, 0.9),
    'zorder': -100, 'linewidth': 0
}
pointplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(), 'scale': 'ELEV_IN_FT',
    'edgecolor': 'white', 'linewidth': 0.5, 'color': 'black'
}
ylim = (-1647757.3894385984, 1457718.4893930717)


# Our first plot is a default linear-scale one. We can see from the results that this is clearly the most appropriate
# one for this specific data.
gplt.polyplot(gpd.GeoSeries(continental_usa), ax=axarr[0][0], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[0][0], limits=(0.1, 10), **pointplot_kwargs)
axarr[0][0].set_title("Linear Scale")
axarr[0][0].set_ylim(ylim)


# Next, a trivial identity scale. This results in a plot where every city has the same size.
def identity_scale(minval, maxval):
    def scalar(val):
        return 2
    return scalar

gplt.polyplot(gpd.GeoSeries(continental_usa), ax=axarr[0][1], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[0][1], scale_func=identity_scale, **pointplot_kwargs)
axarr[0][1].set_title("Identity Scale")
axarr[0][1].set_ylim(ylim)
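
# Hedged continuation of this progression (not part of the original
# snippet): the scale_func hook generalizes beyond the identity case, e.g.
# to a logarithmic scale. Assumes numpy and the same axarr grid of axes.
import numpy as np

def log_scale(minval, maxval):
    def scalar(val):
        # Shift so the smallest value maps to log10(1) == 0.
        return np.log10(val - minval + 1)
    return scalar

gplt.polyplot(gpd.GeoSeries(continental_usa), ax=axarr[1][0], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[1][0],
               scale_func=log_scale, **pointplot_kwargs)
axarr[1][0].set_title("Log Scale")
axarr[1][0].set_ylim(ylim)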