def test_pointplot(self, projection, hue_vars, scale_vars, legend_vars):
    kwargs = {'projection': projection}
    kwargs = {**kwargs, **hue_vars, **scale_vars, **legend_vars}
    try:
        gplt.pointplot(gaussian_points, **kwargs)
    finally:
        plt.close()

def plot_droughts_per_district(data, label_col='drought reported', district_col='District',
                               path='../', country='Uganda', admin_level=1):
    droughts_per_district = data[[district_col, label_col]].groupby(district_col).sum().reset_index()
    gdf_country = gpd.read_file(
        get_country_shapefile(path=path, country=country, admin_level=admin_level), crs='')
    gdf_country.rename(columns={'ADM1_EN': district_col}, inplace=True)
    gdf_country['centroid'] = gdf_country.centroid
    droughts_per_district = gdf_country[[district_col, 'geometry', 'centroid']].merge(
        droughts_per_district, on=district_col)
    droughts_per_district.set_geometry('centroid', drop=True, inplace=True)
    droughts_per_district = droughts_per_district[droughts_per_district[label_col] > 0]
    geoplot.polyplot(gdf_country)
    ax = plt.gca()
    geoplot.pointplot(droughts_per_district, scale=label_col, color='darkred', marker='o',
                      limits=(2, 14), legend=True, legend_values=[1, 3, 6, 9, 12], ax=ax)
    return

def _save_image(shape: gpd.GeoDataFrame, data: gpd.GeoDataFrame, output_file: str):
    fig, ax = plt.subplots(figsize=(6, 6))
    gplt.polyplot(shape, ax=ax, zorder=1)
    gplt.pointplot(data, color="red", s=.5, ax=ax, zorder=2)
    shape_bounds = shape.total_bounds
    ax.set_ylim(shape_bounds[1], shape_bounds[3])
    ax.set_xlim(shape_bounds[0], shape_bounds[2])
    logging.info(f"Saving image to {output_file}")
    plt.savefig(output_file, bbox_inches='tight', pad_inches=0.1, dpi=300)
    # TODO: Solve "RuntimeWarning: More than 20 figures have been opened."
    plt.clf()

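# The TODO above concerns matplotlib's "More than 20 figures have been opened" warning:
# plt.clf() clears the current figure but does not release it, so repeated calls keep
# accumulating open figures. A minimal sketch of one possible fix (an assumption, not the
# original author's confirmed solution): close the figure the function itself created.
import geopandas as gpd
import geoplot as gplt
import matplotlib.pyplot as plt


def _save_image_and_close(shape: gpd.GeoDataFrame, data: gpd.GeoDataFrame, output_file: str):
    fig, ax = plt.subplots(figsize=(6, 6))
    gplt.polyplot(shape, ax=ax, zorder=1)
    gplt.pointplot(data, color="red", s=.5, ax=ax, zorder=2)
    plt.savefig(output_file, bbox_inches='tight', pad_inches=0.1, dpi=300)
    plt.close(fig)  # releases the figure; plt.clf() alone leaves it registered with pyplot
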
def draw_month(month):
    to_full_month = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
        'May': 'May', 'Jun': 'June', 'Jul': 'July'
    }
    frames = []
    for i in range(1, 32):
        day_str = str(i)
        if i < 10:
            day_str = '0' + str(i)
        if os.path.exists(month + ' ' + day_str + '.csv'):
            df1 = pd.read_csv(month + ' ' + day_str + '.csv', header=None,
                              names=['id', 'longitude', 'latitude', 'location', 'created_at', 'lang'])
            frames.append(df1)
    df = pd.concat(frames)
    print(df.shape)
    mydict = dict(df.location.value_counts())
    df['notnan'] = df['location'].notna()
    df['count'] = df.apply(lambda x: mydict[x.location] if x.notnan else 1, axis=1)
    df.drop_duplicates(subset='location', keep='first', inplace=True)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
    scheme = mc.Quantiles(df['count'], k=5)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    ax = gplt.polyplot(world, edgecolor='white', facecolor='lightgray')
    gplt.pointplot(gdf, ax=ax, hue='count', cmap='Reds', scale='count',
                   scheme=scheme, legend=True, legend_var='hue')
    ax.set_title('Discussion on Twitter, ' + to_full_month[month], fontsize=10)
    plt.savefig(month + '.png', dpi=1000)

def test_param_extent_unproj():
    # invalid extent: raise
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(-181, 0, 1, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, -91, 1, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, 0, 181, 1))
    with pytest.raises(ValueError):
        pointplot(p_df, extent=(0, 0, 1, 91))
    # valid extent: set
    return pointplot(p_df, hue='var', linewidth=0, s=10, extent=(-10, -10, 10, 10)).get_figure()

def plot_point_map(gpd_gdf, percentile=0, save_file=None):
    """Plot point data on a map."""
    # Keep only points whose NSE value exceeds the given percentile of all NSE values.
    percentile_data = np.percentile(gpd_gdf['NSE'].values, percentile).astype(float)
    data_chosen = gpd_gdf.query("NSE > " + str(percentile_data))
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))
    proj = gcrs.AlbersEqualArea(central_longitude=-98, central_latitude=39.5)
    polyplot_kwargs = {'facecolor': (0.9, 0.9, 0.9), 'linewidth': 0}
    pointplot_kwargs = {'hue': 'NSE', 'legend': True, 'linewidth': 0.01}
    # ax = gplt.polyplot(contiguous_usa.geometry, projection=proj, **polyplot_kwargs)
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator())
    gplt.pointplot(data_chosen, ax=ax, **pointplot_kwargs)
    ax.set_title("NSE Map")
    plt.show()
    if save_file is not None:
        plt.savefig(save_file)

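# A hypothetical invocation of plot_point_map, assuming `sites` is a GeoDataFrame of
# point geometries with a numeric 'NSE' column (the variable name and output file
# below are illustrative, not taken from the original code):
plot_point_map(sites, percentile=25, save_file="nse_map.png")
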
def draw_per_day(date):
    if os.path.exists(date + '.csv') is False:
        return
    print(date)
    df = pd.read_csv(date + '.csv', header=None,
                     names=['id', 'longitude', 'latitude', 'location', 'created_at', 'lang'])
    print(date, df.shape)
    # assign count value
    mydict = dict(df.location.value_counts())
    df['notnan'] = df['location'].notna()
    df['count'] = df.apply(lambda x: mydict[x.location] if x.notnan else 1, axis=1)
    df.drop_duplicates(subset='location', keep='first', inplace=True)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
    scheme = mc.Quantiles(df['count'], k=5)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    ax = gplt.polyplot(world, edgecolor='white', facecolor='lightgray')
    gplt.pointplot(gdf, ax=ax, hue='count', cmap='Reds', scale='count',
                   scheme=scheme, legend=True, legend_var='hue')
    ax.set_title('Discussion on Twitter, ' + date, fontsize=10)
    plt.savefig(date + '.png', dpi=1000)

def test_pointplot(self):
    try:
        gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(), color='white')
        gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(), s=5)
        gplt.pointplot(list_gaussian_points, projection=gcrs.PlateCarree(),
                       legend_kwargs={'fancybox': False})
    finally:
        plt.close()

# geospatial areas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
Europe = world.loc[world['continent'] == 'Europe']
USA = world.loc[world['continent'] == 'North America']
Japan = world.loc[world['name'] == 'Japan']
Korea = world.loc[world['name'] == 'South Korea']

# geospatial pointplots
ax = gplt.polyplot(world, linewidth=0.7)
gplt.pointplot(gdf, hue=query, cmap='rainbow', k=2, alpha=0.8, scale=query,
               limits=(20, 20), legend=True, legend_values=[-1, 1],
               legend_labels=['negative', 'positive'], ax=ax)
plt.title('Global Station Distribution For Stations With Valid Data Series')

Eu_ax = gplt.polyplot(Europe, linewidth=0.7)
gplt.pointplot(Eu_gdf, hue=query, cmap='rainbow', k=2, alpha=0.8, scale=query,
               limits=(30, 30),

ax_hist2.set(ylabel='Japan-S.Korea')
sns.despine(ax=ax_hist)
sns.despine(ax=ax_hist1)
sns.despine(ax=ax_hist2)

# geospatial locations
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
Europe = world.loc[world['continent'] == 'Europe']
USA = world.loc[world['continent'] == 'North America']
Japan = world.loc[world['name'] == 'Japan']
Korea = world.loc[world['name'] == 'South Korea']

# plot of station distribution
ax = gplt.polyplot(world, linewidth=0.7)
gplt.pointplot(gdf, color='red', ax=ax)
# plt.title('Global Station Distribution For Stations With Valid Data Series')

# geospatial plots (titles are commented out)
Eu_ax = gplt.polyplot(Europe, linewidth=0.7)
gplt.pointplot(Eu_gdf, hue=query, cmap='plasma', k=None, alpha=1, scale=query,
               limits=(25, 25), legend=True, ax=Eu_ax)

df_map = gpd.GeoDataFrame.from_file('Virtual_Map1.shp')
df_city = pd.read_csv("Virtual_City.csv")
geom = gpd.GeoSeries([Point(x, y) for x, y in zip(df_city.long.values, df_city.lat.values)])
df_city = gpd.GeoDataFrame(df_city, geometry=geom)

# --------------------------- (a) Black-and-white Voronoi map ---------------------------
ax1 = gplt.voronoi(
    df_city,
    # projection=gcrs.AlbersEqualArea(),
    clip=df_map,
    linewidth=0.5,
    # hue='orange', cmap='Reds', k=5,
    legend=False,
    edgecolor='k')
ax2 = gplt.pointplot(df_city, color='white', s=10, edgecolors='k', ax=ax1)  # zorder=2,
gplt.polyplot(df_map, edgecolor='none', facecolor='lightgray', ax=ax1)  # zorder=1,
# plt.savefig('沃罗诺伊地图2.pdf')

# --------------------------- (b) Colored Voronoi map ---------------------------
ax = gplt.voronoi(
    df_city,
    # projection=gcrs.AlbersEqualArea(),
    clip=df_map,
    hue='city',
    cmap='Set1',
    legend=True,
    k=10,
    edgecolor='w',
    alpha=0.75,
    legend_kwargs={

def test_scale_params(kwargs):
    return pointplot(p_df, **kwargs).get_figure()

def test_legend_params(kwargs):
    return pointplot(p_df, **kwargs).get_figure()

def test_pointplot(self):
    try:
        gplt.pointplot(series_gaussian_points, k=2)
        gplt.pointplot(dataframe_gaussian_points, k=2)
        gplt.pointplot(dataframe_gaussian_points, hue=list_hue_values, k=None)
        gplt.pointplot(dataframe_gaussian_points, hue=series_hue_values, k=None)
        gplt.pointplot(dataframe_gaussian_points, hue=map_hue_values(), k=None)
        gplt.pointplot(dataframe_gaussian_points, hue='hue_var', k=None)
    finally:
        plt.close('all')

""" Quadtree of NYC traffic collisions ================================== This example plots traffic collisions in New York City. Overlaying a ``pointplot`` on a ``quadtree`` like this communicates information on two visual channels, position and texture, simultaneously. """ import geopandas as gpd import geoplot as gplt import geoplot.crs as gcrs import matplotlib.pyplot as plt nyc_boroughs = gpd.read_file(gplt.datasets.get_path('nyc_boroughs')) collisions = gpd.read_file(gplt.datasets.get_path('nyc_collision_factors')) ax = gplt.quadtree(collisions, nmax=1, projection=gcrs.AlbersEqualArea(), clip=nyc_boroughs, facecolor='lightgray', edgecolor='white', zorder=0) gplt.pointplot(collisions, s=1, ax=ax) plt.title("New York Ciy Traffic Collisions, 2016")
def geospatial_viz(geo_data_url, point_data_url=None, att_var=None, map_type=None):
    '''
    Visualize attribute information on a map (e.g., population by state).

    geo_data_url: source of a geodataframe that contains geometry (and, when no point
        data is supplied, the attribute to visualize)
    point_data_url: optional source of point attribute data
    att_var: the attribute to be visualized on the map
    map_type: string, the type of map to draw: pointplot, choropleth, or voronoi
    If point_data_url is None, att_var must be a column of geo_data.
    '''
    geo_data = gpd.read_file(geo_data_url)
    print(geo_data.head())

    if point_data_url == 'No point attribute data':
        if att_var is None:
            ax = gplt.polyplot(geo_data, figsize=(10, 5))
            ax.set_title('plain map of continental USA', fontsize=16)
        else:
            if map_type == 'choropleth':
                scheme = mc.FisherJenks(geo_data[att_var], k=5)
                labels = scheme.get_legend_classes()
                ax = gplt.polyplot(geo_data, projection=gcrs.AlbersEqualArea())
                gplt.choropleth(geo_data, hue=att_var, edgecolor='white', linewidth=1,
                                cmap='Blues', legend=True, scheme=scheme,
                                legend_labels=labels, ax=ax)
                ax.set_title('{} in the continental US'.format(att_var), fontsize=16)
            if map_type == "cartogram":
                gplt.cartogram(geo_data, scale=att_var, edgecolor='black',
                               projection=gcrs.AlbersEqualArea())
    else:
        point_data = gpd.read_file(point_data_url)
        scheme = mc.Quantiles(point_data[att_var], k=5)
        labels = scheme.get_legend_classes()
        if map_type == 'pointplot':
            if isinstance(point_data.geometry[0], shapely.geometry.point.Point):
                ax = gplt.polyplot(geo_data, edgecolor='white', facecolor='lightgray',
                                   figsize=(12, 8)
                                   # projection=gcrs.AlbersEqualArea()
                                   )
                gplt.pointplot(point_data, ax=ax, hue=att_var, cmap='Blues',
                               scheme=scheme, scale=att_var,
                               legend=True, legend_var='scale',
                               legend_kwargs={"loc": 'lower right'},
                               legend_labels=labels)
                ax.set_title('Cities in the continental US, by population 2010',
                             fontsize=16)
            else:
                print('Geometry data type not valid')
        if map_type == "voronoi":
            # check uniqueness of coordinates
            duplicates = point_data.geometry.duplicated()
            point_data_unique = point_data[~duplicates]
            proj = gplt.crs.AlbersEqualArea(central_longitude=-98, central_latitude=39.5)
            ax = gplt.voronoi(point_data_unique, hue=att_var, clip=geo_data,
                              projection=proj, cmap='Blues', legend=True,
                              edgecolor="white", linewidth=0.01)
            gplt.polyplot(geo_data, ax=ax, extent=geo_data.total_bounds,
                          edgecolor="black", linewidth=1, zorder=1)
            plt.title("{} in US cities".format(att_var), fontsize=16)

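# A hypothetical invocation of geospatial_viz, assuming GeoJSON sources for the
# contiguous US and its cities; the file paths and the column name below are
# placeholders for illustration, not taken from the original code:
geospatial_viz(
    geo_data_url="contiguous_usa.geojson",
    point_data_url="usa_cities.geojson",
    att_var="POP_2010",
    map_type="pointplot",
)
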
# ## Making Maps

# %% Collapsed="false"
f, ax = plt.subplots(dpi=200)
countries.plot(edgecolor='k', facecolor='None', linewidth=0.6, ax=ax)
cities.plot(markersize=0.5, facecolor='red', ax=ax)
lat_am_capitals.plot(markersize=0.5, facecolor='y', ax=ax)
ax.set_title('World Map')
ax.set_axis_off()

# %% [markdown] Collapsed="false"
# ## Static Webmaps

# %% Collapsed="false"
ax = gplt.webmap(countries, projection=gplt.crs.WebMercator(), figsize=(16, 12))
gplt.pointplot(cities, ax=ax, hue='POP2015')

# %% [markdown] Collapsed="false"
# ## Aside on Projections

# %% [markdown] Collapsed="false"
# Map projections flatten a globe's surface onto a 2D plane. This necessarily distorts
# the surface (one of Gauss' lesser-known results), so one must choose a specific form
# of 'acceptable' distortion.
#
# By convention, the standard coordinate reference system in GIS is the World Geodetic
# System (lat/lon - `WGS84`). Plotting raw lat/lon amounts to a cylindrical projection,
# which stretches distances east-west and *results in incorrect distance and areal
# calculations*. For accurate distance and area calculations, try to use UTM (which
# divides the map into zones). See [epsg.io](https://epsg.io)

# %% Collapsed="false"
countries.crs

# %% Collapsed="false"
countries_2 = countries.copy()
countries_2 = countries_2.to_crs({'init': 'epsg:3035'})

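# %% [markdown] Collapsed="false"
# The aside above recommends reprojecting out of lat/lon before measuring distances or
# areas. A minimal sketch of that advice, assuming the same `countries` GeoDataFrame in
# WGS84; the choice of EPSG:6933 (a global equal-area CRS) is an illustrative assumption:

# %% Collapsed="false"
countries_equal_area = countries.to_crs(epsg=6933)   # reproject before measuring
area_km2 = countries_equal_area.geometry.area / 1e6  # square metres -> square kilometres
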
def main(): use_trim = True update_data = True root_path = os.getcwd() cache_path = os.path.join(root_path, r'data/202008-citibike-tripdata-trimmed.pickle') cache_path_full = os.path.join( root_path, r'data/202008-citibike-tripdata-full.pickle') if (update_data == False) and (use_trim): data = pickle.load(open(cache_path, 'rb')) print(f'Loaded trimmed data from {cache_path}') elif (update_data == False) and (use_trim == False): data = pickle.load(open(cache_path_full, 'rb')) else: data_JC = pd.read_csv( os.path.join(root_path, r'data/JC-202008-citibike-tripdata.csv')) data_NY = pd.read_csv( os.path.join(root_path, r'data/202008-citibike-tripdata.csv')) print(f'Loaded full data') if use_trim: data_NY_part = data_NY[::100] # data=data_process_data(pd.concat([data_JC, data_NY_part])) data = data_process_data(copy.deepcopy(data_NY_part)) pickle.dump(data, open(cache_path, 'wb')) print(f'Use trim data, saved a cache into {cache_path}') else: data_NY_part = data_NY[::10] # data=data_process_data(pd.concat([data_JC, data_NY_part])) data = data_process_data(copy.deepcopy(data_NY_part)) pickle.dump(data, open(cache_path_full, 'wb')) print(f'Use full data, saved a cache into {cache_path_full}') mask = data[data['start station id'] == data['end station id']].index data = data.drop(index=mask) map_JC = gpd.read_file( os.path.join( root_path, r'Data/jersey-city-neighborhoods/jersey-city-neighborhoods.shp') ).to_crs(epsg=4326) map_JC = map_JC[['name', 'geometry']] map_JC['name'] = map_JC['name'].apply(lambda x: f'JC {x}') map_JC['region'] = 'JC' map_JC.columns = ['area', 'geometry', 'boro'] map_NY = gpd.read_file( os.path.join( root_path, r'Data/Neighborhood Tabulation Areas/NY neighborhoods.shp') ).to_crs(epsg=4326) map_NY = map_NY[['ntaname', 'geometry', 'boro_name']] map_NY.columns = ['area', 'geometry', 'boro'] map = pd.concat([map_JC, map_NY], ignore_index=True) map['centroid'] = map.geometry.centroid # EDA run_eda = False if run_eda: plt.close('all') data['start_hr'].value_counts(sort=False).plot(kind='bar') data['start_weekday'].value_counts(sort=False).plot(kind='bar') data['usertype'].value_counts(sort=False).plot(kind='bar') data.groupby('usertype')['start_weekday'].value_counts( sort=False).plot(kind='bar') data.groupby('usertype')['start_hr'].value_counts(sort=False).plot( kind='bar') ax = data[data['usertype'] == 'Subscriber'].groupby([ 'start_weekday' ])['start_hr'].value_counts(sort=False).plot(kind='bar') data[data['usertype'] == 'Customer'].groupby([ 'start_weekday' ])['start_hr'].value_counts(sort=False).plot(kind='bar', ax=ax, color='red') ax.xaxis.set_major_locator(ticker.NullLocator()) # Outlier on the first two days - need to remove # get map and station info with area station_profile = summarize_station(data) station_profile = gpd.GeoDataFrame( station_profile, geometry=gpd.points_from_xy(station_profile['station longitude'], station_profile['station latitude']), crs={ 'init': 'epsg:4326', 'no_defs': True }) station_profile_gis = gpd.sjoin(station_profile, map, how='left', op='within') # summarize net rides by station by hour data = pd.merge(data, area_concat( 'start', station_profile_gis[['station id', 'area', 'boro']]), how='left', on='start station id') data = pd.merge(data, area_concat( 'end', station_profile_gis[['station id', 'area', 'boro']]), how='left', on='end station id') # group by station rides_byStation = summary_ride(data, 'station id') rides_byStation_byHour = summarize_rides_by_hour(rides_byStation, 'station id') # group by area rides_byArea = summary_ride(data, 
'area') # len(rides_byArea[rides_byArea['net_checkout'].apply(lambda x: isinstance(x, float)==False)]) rides_byArea_byHour = summarize_rides_by_hour(rides_byArea, 'area') # rides_byArea_byHour_gis=gpd.GeoDataFrame(rides_byArea_byHour.merge(map[['boro','area','centroid']], on='area'), geometry='centroid') rides_byArea_byHour_gis = gpd.GeoDataFrame(rides_byArea_byHour.merge( map[['boro', 'area', 'geometry']], on='area'), geometry='geometry') plot_rides_on_map = False if plot_rides_on_map: rides_byStation_byHour_gis = pd.merge(rides_byStation_byHour, station_profile_gis, on='station id') for i in range(0, 24): ax = map.plot(figsize=(8, 8), alpha=0.5, edgecolor='k') # rides_byStation_byHour_gis.plot(ax=ax, color='red', markersize=rides_byStation_byHour_gis[0]) select_hr = str(i) gplt.pointplot(rides_byStation_byHour_gis[[select_hr, 'geometry']], hue=select_hr, scale=select_hr, ax=ax, legend=True, legend_var='hue') plt.savefig( os.path.join(root_path, r'plots/202008_station_' + select_hr + '.png')) # lda/pca to reduce features plt.close('all') for i in range(0, 24): ax = map.plot(figsize=(8, 8), alpha=0.5, edgecolor='k') # rides_byStation_byHour_gis.plot(ax=ax, color='red', markersize=rides_byStation_byHour_gis[0]) select_hr = str(i) # gplt.pointplot(rides_byArea_byHour_gis[[select_hr, 'centroid']], hue=select_hr, scale=select_hr, ax=ax, legend=True, # legend_var='hue') rides_byArea_byHour_gis.plot(column=select_hr, ax=ax, legend=True) plt.savefig( os.path.join( root_path, r'plots/202008_area_choropleth_' + select_hr + '.png')) plt.close('all') data['distance'] = abs(data['end station longitude'] - data['start station longitude']) + abs( data['end station latitude'] - data['start station latitude']) data.drop(index=data[data['distance'] == 0].index, inplace=True) data['speed'] = data['distance'] / data['tripduration'] # data['start_area_net_checkout'] = data[['start area','start_date_hr']].apply( # lambda x: rides_byArea[((rides_byArea['area']==x.iloc[0]) & (rides_byArea['date_hour'] == x.iloc[1]))]['net_checkout']) start_area_checkout = rides_byArea[['area', 'date_hour', 'net_checkout']] start_area_checkout.columns = [ 'start area', 'start_date_hr', 'start_area_net_checkout' ] data = pd.merge(data, start_area_checkout, on=['start area', 'start_date_hr'], how='left') end_area_checkout = rides_byArea[['area', 'date_hour', 'net_checkout']] end_area_checkout.columns = [ 'end area', 'stop_date_hr', 'end_area_net_checkout' ] data = pd.merge(data, end_area_checkout, on=['end area', 'stop_date_hr'], how='left') start_station_checkout = rides_byStation[[ 'station id', 'date_hour', 'net_checkout' ]] start_station_checkout.columns = [ 'start station id', 'start_date_hr', 'start_station_net_checkout' ] data = pd.merge(data, start_station_checkout, on=['start station id', 'start_date_hr'], how='left') end_station_checkout = rides_byStation[[ 'station id', 'date_hour', 'net_checkout' ]] end_station_checkout.columns = [ 'end station id', 'stop_date_hr', 'end_station_net_checkout' ] data = pd.merge(data, end_station_checkout, on=['end station id', 'stop_date_hr'], how='left') feature_visualization = False plt.close('all') if feature_visualization: sns.distplot(data['start station latitude']) sns.distplot(data['start station longitude']) sns.distplot(data.start_area_net_checkout) sns.distplot(data.end_area_net_checkout) sns.distplot(data.start_station_net_checkout) sns.distplot(data.end_station_net_checkout) sns.distplot(data.distance) sns.distplot(data['distance'].apply(lambda x: math.log(x * 100))) 
sns.distplot(data.speed) # customer feature normalization data_customer_std = pd.DataFrame() data_customer_std['hr_x'] = data['start_hr'].apply( lambda hour: math.sin(2 * math.pi * hour / 24)) data_customer_std['hr_y'] = data['start_hr'].apply( lambda hour: math.cos(2 * math.pi * hour / 24)) col = 'distance' data_customer_std[col] = data[col].apply(lambda x: math.log(x * 100)) # col='start_weekday' # data_customer_std[col]= data[col].apply(lambda x: 1 if x>=6 else 0) data_customer_std['weekday_x'] = data['start_weekday'].apply( lambda day: math.sin(2 * math.pi * day / 7)) data_customer_std['weekday_y'] = data['start_weekday'].apply( lambda day: math.cos(2 * math.pi * day / 7)) col_list = [ 'distance', 'start station latitude', 'start station longitude', 'end station latitude', 'end station longitude', 'start_area_net_checkout', 'end_area_net_checkout', 'start_station_net_checkout', 'end_station_net_checkout' ] data_customer_std.loc[:, col_list] = data[col_list] data_customer_std.fillna(0, inplace=True) for col in data_customer_std.columns: data_customer_std[col] = data_customer_std[col] / np.std( data_customer_std[col]) # sns.violinplot(data=data_customer_std,orient='h') dimension_reduction = False ## dimension reduction for visualization if dimension_reduction: # pca pca_plot = True if pca_plot: pca = PCA() data_customer_pca = pca.fit_transform(data_customer_std) fig = plt.figure(figsize=(12, 8)) plt.scatter(data_customer_pca[:, 0], data_customer_pca[:, 1], s=1) plt.xlabel('pca feature 1') plt.ylabel('pca feature 2') plt.title('pca dimension reduction 2D') # pca.explained_variance_ # pca_components=pd.DataFrame(pca.components_) # pca_components.columns=data_customer.columns # ax = pca_components.plot(kind='bar',stacked=True) # ax.legend(loc=1,fontsize=8) plt.savefig(os.path.join(root_path, r'plots/202008_pca_2D.png')) tsne_plot = False if tsne_plot: # t-SNE tsne = TSNE(random_state=42, n_components=3, verbose=0, perplexity=40, n_iter=400).fit_transform(data_customer_std) # 2D fig = plt.figure(figsize=(12, 8)) plt.scatter(tsne[:, 0], tsne[:, 1], s=1) plt.xlabel('tsne feature 1') plt.ylabel('tsne feature 2') plt.title('tSNE dimension reduction 2D') plt.savefig(os.path.join(root_path, r'plots/202008_tsne_2D.png')) # 3D fig = plt.figure(figsize=(12, 8)) ax = fig.add_subplot(111, projection='3d') ax.scatter(tsne[:, 0], tsne[:, 1], tsne[:, 2], s=1) ax.set_xlabel('tsne feature 1') ax.set_ylabel('tsne feature 2') ax.set_zlabel('tsne feature 3') plt.title('tSNE dimension reduction 3D') plt.savefig(os.path.join(root_path, r'plots/202008_tsne_3D.png')) plt.close('all') # umap umap_plot = False if umap_plot: fig, ax = plt.subplots(3, 2) fig.set_size_inches(10, 20) for i, n in enumerate([10, 50, 100]): embedding_corr = umap.UMAP( n_neighbors=n, min_dist=0.3, metric='correlation').fit_transform(data_customer_std) ax[i, 0].scatter(embedding_corr[:, 0], embedding_corr[:, 1], edgecolor='none', alpha=0.80, s=10) ax[i, 0].set_xlabel('umap feature 1') ax[i, 0].set_ylabel('umap feature 2') ax[i, 0].set_title( f'umap dimension reduction_corr metrics_{n}_neighbors') embedding_dist = umap.UMAP( n_neighbors=n, min_dist=0.3, metric='euclidean').fit_transform(data_customer_std) ax[i, 1].scatter(embedding_dist[:, 0], embedding_dist[:, 1], edgecolor='none', alpha=0.80, s=10) ax[i, 1].set_xlabel('umap feature 1') ax[i, 1].set_ylabel('umap feature 2') ax[i, 1].set_title( f'umap dimension reduction_euclidean metrics_{n}_neighbors' ) plt.suptitle('umap visualization') plt.savefig( os.path.join(root_path, 
r'plots/202008_umap_visualization.png')) plt.close('all') clustering = True if clustering: ## clustering # k-means data_customer_std_sample = copy.deepcopy(data_customer_std.loc[::1, :]) num_max = 4 clustering_kmeans = True if clustering_kmeans: start_time = time.process_time() kmeans_labels_agg = {} sil_scores_kmeans_agg = {} ch_scores_kmeans_agg = {} for num in range(2, num_max + 1): kmeans = KMeans(n_clusters=num, random_state=0) kmeans_labels_agg[num] = kmeans.fit_predict( data_customer_std_sample) sil_scores_kmeans_agg[num] = metrics.silhouette_score( data_customer_std_sample, kmeans_labels_agg[num]) ch_scores_kmeans_agg[num] = metrics.calinski_harabasz_score( data_customer_std_sample, kmeans_labels_agg[num]) # pd.DataFrame.from_dict(sil_scores_kmeans_agg.values()).plot() clustering_hierachy = True if clustering_hierachy: start_time = time.process_time() ward_labels_agg = {} sil_scores_ward_agg = {} ch_scores_ward_agg = {} for num in range(2, num_max + 1): ward_clustering = AgglomerativeClustering( n_clusters=num, linkage='ward').fit(data_customer_std_sample) ward_labels_agg[num] = ward_clustering.labels_ sil_scores_ward_agg[num] = metrics.silhouette_score( data_customer_std_sample, ward_labels_agg[num]) ch_scores_ward_agg[num] = metrics.calinski_harabasz_score( data_customer_std_sample, ward_labels_agg[num]) print( f'ward clustering takes time {time.process_time() - start_time}' ) # pd.DataFrame.from_dict(sil_scores_ward_agg.values()).plot() clustering_gmm = True if clustering_gmm: start_time = time.process_time() gmm_labels_agg = {} sil_scores_gmm_agg = {} ch_scores_gmm_agg = {} for num in range(2, num_max + 1): gmm_clustering = GaussianMixture( n_components=num).fit(data_customer_std_sample) gmm_labels_agg[num] = gmm_clustering.predict( data_customer_std_sample) sil_scores_gmm_agg[num] = metrics.silhouette_score( data_customer_std_sample, gmm_labels_agg[num]) ch_scores_gmm_agg[num] = metrics.calinski_harabasz_score( data_customer_std_sample, gmm_labels_agg[num]) print( f'gmm clustering takes time {time.process_time() - start_time}' ) umap_clustering = True if umap_clustering: embedding_corr = umap.UMAP( n_neighbors=10, min_dist=0.3, metric='correlation').fit_transform(data_customer_std_sample) start_time = time.process_time() kmeans_labels_umap = {} sil_scores_kmeans_umap = {} ch_scores_kmeans_umap = {} for num in range(2, num_max + 1): kmeans = KMeans(n_clusters=num, random_state=0) kmeans_labels_umap[num] = kmeans.fit_predict(embedding_corr) sil_scores_kmeans_umap[num] = metrics.silhouette_score( data_customer_std_sample, kmeans_labels_umap[num]) ch_scores_kmeans_umap[num] = metrics.calinski_harabasz_score( data_customer_std_sample, kmeans_labels_umap[num]) start_time = time.process_time() ward_labels_umap = {} sil_scores_ward_umap = {} ch_scores_ward_umap = {} for num in range(2, num_max + 1): ward_clustering = AgglomerativeClustering( n_clusters=num, linkage='ward').fit(embedding_corr) ward_labels_umap[num] = ward_clustering.labels_ sil_scores_ward_umap[num] = metrics.silhouette_score( data_customer_std_sample, ward_labels_umap[num]) ch_scores_ward_umap[num] = metrics.calinski_harabasz_score( data_customer_std_sample, ward_labels_umap[num]) print( f'ward clustering takes time {time.process_time() - start_time}' ) start_time = time.process_time() gmm_labels_umap = {} sil_scores_gmm_umap = {} ch_scores_gmm_umap = {} for num in range(2, num_max + 1): gmm_clustering = GaussianMixture( n_components=3).fit(embedding_corr) gmm_labels_umap[num] = gmm_clustering.predict(embedding_corr) 
sil_scores_gmm_umap[num] = metrics.silhouette_score( data_customer_std_sample, gmm_labels_umap[num]) ch_scores_gmm_umap[num] = metrics.calinski_harabasz_score( data_customer_std_sample, gmm_labels_umap[num]) print( f'gmm clustering takes time {time.process_time() - start_time}' ) plot_hierachy_linkage = False if plot_hierachy_linkage: ward_clustering_full = AgglomerativeClustering( distance_threshold=0, n_clusters=None).fit(data_customer_std_sample) linkage = hierarchy.linkage(ward_clustering_full.children_, 'ward') plt.figure(figsize=(10, 7)) dn = hierarchy.dendrogram(linkage) # plot_dendrogram(ward_clustering_full,truncate_mode='level', p=3) plot_clustering_2D = False if plot_clustering_2D: embedding_corr = umap.UMAP( n_neighbors=10, min_dist=0.3, metric='correlation').fit_transform(data_customer_std_sample) # labels=ward_labels_agg[4] labels = ward_labels_umap[2] # visualize clustering fig = plt.figure(figsize=(12, 8)) plt.scatter(embedding_corr[:, 0], embedding_corr[:, 1], edgecolor='none', alpha=0.80, s=10, c=labels) plt.xlabel('umap feature 1') plt.ylabel('umap feature 2') # plt.title(f'umap visualization with kmeans clustering labelling') # plt.savefig(os.path.join(root_path,r'plots/202008_umap_visualization_kmeans_clustering.png')) plt.title( f'umap visualization with ward hierachy clustering labelling') plt.savefig( os.path.join( root_path, r'plots/202008_umap_visualization_ward_clustering.png')) plot_clustering_feature_detail = True if plot_clustering_feature_detail: # analyze feature importance labels_dict = { 0: kmeans_labels_agg, 1: ward_labels_agg, 2: gmm_labels_agg, 3: kmeans_labels_umap, 4: ward_labels_umap, 5: gmm_labels_umap } labels_str_dict = { 0: 'kmeans', 1: 'ward', 2: 'gmm', 3: 'kmeans_umap', 4: 'ward_umap', 5: 'gmm_umap' } for type in range(0, 3): for cluster_num in range(2, num_max + 1): # cluster_num=4 col_select = [ 'start station longitude', 'start station latitude', 'end station latitude', 'end station longitude', 'start_hr', 'start_weekday', 'start_area_net_checkout', 'end_area_net_checkout', 'start_station_net_checkout', 'end_station_net_checkout' ] # fig, ax = plt.subplots(len(col_select), cluster_num) # fig.set_size_inches(5 * cluster_num, 20) # plt.suptitle('clustering feature analysis') # plt.tight_layout() # # labels = labels_dict[type][cluster_num] # df_customer_cluster = {} # # # for cluster_i in range(0, cluster_num): # print(f'analyze cluster {cluster_i}') # mask_i = np.argwhere(labels == cluster_i).ravel() # mask_i_original = data_customer_std_sample.iloc[mask_i].index # df_customer_cluster[cluster_i] = data.loc[mask_i_original].copy() # for i, col in enumerate(col_select): # ax[i, cluster_i] = sns.histplot(ax=ax[i, cluster_i], # data=df_customer_cluster[cluster_i][col], kde=True) # # plt.savefig(os.path.join(root_path, r'plots', # f'202008_clustering feature analysis_{labels_str_dict[type]}_{cluster_num}.png')) labels = labels_dict[type][cluster_num] df_customer_cluster = {} gs_kw = dict(height_ratios=[1.5, 4, 4, 2, 2, 1.5]) fig, ax = plt.subplots(6, cluster_num, constrained_layout=True, gridspec_kw=gs_kw, figsize=(8 * cluster_num, 30)) #plt.tight_layout(pad=8) for cluster_i in range(0, cluster_num): print( f'analyze cluster {cluster_i} of {labels_str_dict[type]}' ) mask_i = np.argwhere(labels == cluster_i).ravel() mask_i_original = data_customer_std_sample.iloc[ mask_i].index df_customer_cluster[cluster_i] = copy.deepcopy( data.loc[mask_i_original]) df_customer_cluster[cluster_i][ 'start area'] = df_customer_cluster[cluster_i][ 'start 
area'].apply(lambda x: x.split('-')[0]) df_customer_cluster[cluster_i][ 'end area'] = df_customer_cluster[cluster_i][ 'end area'].apply(lambda x: x.split('-')[0]) #bar plot starting area ###question: appearance of the rides in general: by boro df1 = pd.DataFrame( (df_customer_cluster[cluster_i] ['start boro'].value_counts())).sort_values( by='start boro', ascending=False) df2 = pd.DataFrame( (df_customer_cluster[cluster_i] ['end boro'].value_counts())).sort_values( by='end boro', ascending=False) df_boro = pd.merge(df1, df2, how='outer', left_index=True, right_index=True) df_boro.plot(kind='bar', ax=ax[0, cluster_i]) ax[0, cluster_i].title.set_text( f'Rides occurrence by borough in cluster {cluster_i+1}' ) ##appearance of the rides by area df_weekday = df_customer_cluster[cluster_i][ df_customer_cluster[cluster_i] ['start_weekday'].isin([1, 2, 3, 4, 5])] df1 = pd.DataFrame( (df_weekday['start area'].value_counts() )).sort_values(by='start area', ascending=False) df2 = pd.DataFrame( (df_weekday['end area'].value_counts() )).sort_values(by='end area', ascending=False) df_area = pd.merge(df1, df2, how='outer', left_index=True, right_index=True) df_area['start area'].fillna(0, inplace=True) df_area['end area'].fillna(0, inplace=True) df_area.sort_values('start area', ascending=True, inplace=True) df_area.plot(kind='barh', ax=ax[1, cluster_i]) ax[1, cluster_i].tick_params(labelsize=8) ax[1, cluster_i].title.set_text( f'Rides occurrence by area (weekday) in cluster {cluster_i+1}' ) #weekend df_weekend = df_customer_cluster[cluster_i][ df_customer_cluster[cluster_i] ['start_weekday'].isin([6, 7])] df1 = pd.DataFrame( (df_weekend['start area'].value_counts() )).sort_values(by='start area', ascending=False) df2 = pd.DataFrame( (df_weekend['end area'].value_counts() )).sort_values(by='end area', ascending=False) df_area = pd.merge(df1, df2, how='outer', left_index=True, right_index=True) df_area['start area'].fillna(0, inplace=True) df_area['end area'].fillna(0, inplace=True) df_area.sort_values('start area', ascending=True, inplace=True) df_area.plot(kind='barh', ax=ax[2, cluster_i]) ax[2, cluster_i].tick_params(labelsize=8) ax[2, cluster_i].title.set_text( f'Rides occurrence by area (weekend) in cluster {cluster_i+1}' ) ##appearance of the rides by hour df_start_time_raw = df_customer_cluster[cluster_i][[ 'start_weekday', 'start_hr' ]].groupby(['start_weekday', 'start_hr']).size() df_start_time = df_start_time_raw.reset_index() df_start_time.columns = [ 'ride_day', 'ride_hr', 'count' ] #weekay rides df_start_time_weekday = df_start_time.loc[ df_start_time['ride_day'].isin([1, 2, 3, 4, 5])] sns.barplot(data=df_start_time_weekday, x="ride_hr", y='count', hue='ride_day', palette='husl', ax=ax[3, cluster_i]) ax[3, cluster_i].title.set_text( f'Rides occurrence by hour (weekday) in cluster {cluster_i+1}' ) #weekend rides df_start_time_weekend = df_start_time.loc[ df_start_time['ride_day'].isin([6, 7])] sns.barplot(data=df_start_time_weekend, x="ride_hr", y='count', hue='ride_day', palette='husl', ax=ax[4, cluster_i]) ax[4, cluster_i].title.set_text( f'Rides occurrence by hour (weekend) in cluster {cluster_i+1}' ) df_customer_cluster[cluster_i].usertype.value_counts( ).plot(kind='bar', ax=ax[5, cluster_i]) ax[5, cluster_i].title.set_text( f'Rides occurrence by usertype in cluster {cluster_i + 1}' ) # ax=sns.histplot(df_start_time, x="ride_hr",binwidth=1, y='count', hue='ride_day',palette="pastel",unstack) # sns.histplot(df_start_time, x="ride_hr",binwidth=1, y='count',hue='ride_day',ax=ax[2, cluster_i]) 
# ax[2, cluster_i].title.set_text(f'Day/Time of the rides in cluster {cluster_i+1}') # # #bar plot start and end area demand comparison # df_val = df_customer_cluster[cluster_i][ # ['start_area_net_checkout', 'end_area_net_checkout']].groupby( # ['start_area_net_checkout', 'end_area_net_checkout']).size() # df_checkout = df_customer_cluster[cluster_i][ # ['start_area_net_checkout', 'end_area_net_checkout']].copy() # df_checkout.dropna(inplace=True) # df_checkout['val'] = df_checkout.apply(lambda x: df_val[x.iloc[0]][x.iloc[1]], axis=1) # df_checkout.plot.scatter( # x='start_area_net_checkout', y='end_area_net_checkout', s='val', ax=ax[6, cluster_i]) # ax[6, cluster_i].title.set_text(f'Net Checkouts Comparison of Start & End Area in {cluster_i+1}') # plt.setp(ax[0, cluster_i].yaxis.get_majorticklabels(), fontsize=8) # plt.setp(ax[1, cluster_i].yaxis.get_majorticklabels(), fontsize=8) # plt.setp(ax[2, cluster_i].yaxis.get_majorticklabels(), fontsize=8) plt.savefig( os.path.join( root_path, r'plots', f'202008_{labels_str_dict[type]}_{cluster_num}_cluster_feature_detail.png' )) plt.close('all') for type in range(0, 3): for cluster_num in range(2, num_max + 1): # geoplot # type = 2 # cluster_num = 3 labels = labels_dict[type][cluster_num] df_customer_cluster = {} fig, ax = plt.subplots(cluster_num, 2) fig.set_size_inches(15, 7 * cluster_num) plt.tight_layout(pad=5) for cluster_i in range(0, cluster_num): print(f'analyze cluster {cluster_i}') mask_i = np.argwhere(labels == cluster_i).ravel() mask_i_original = data_customer_std_sample.iloc[ mask_i].index df_customer_cluster[cluster_i] = data.loc[ mask_i_original].copy() # df_customer_cluster[cluster_i] = pd.merge(df_customer_cluster[cluster_i], # area_concat('start', station_profile_gis[ # ['station id', 'geometry']]), # how='left', on='start station id') # df_customer_cluster[cluster_i] = pd.merge(df_customer_cluster[cluster_i], # area_concat('end', station_profile_gis[ # ['station id', 'geometry']]), # how='left', on='end station id') #df_customer_cluster[cluster_i]['weekend_flag']=df_customer_cluster[cluster_i]['start_weekday'].apply(lambda x: 1 if x>=6 else 0) # counter_start = df_customer_cluster[cluster_i]['start area'].value_counts() # counter_end = df_customer_cluster[cluster_i]['end area'].value_counts() # df_customer_cluster[cluster_i]['start_area_net_checkout_median'] = df_customer_cluster[ # cluster_i]['start area'].apply(lambda x: counter_start[x]) # df_customer_cluster[cluster_i]['end_area_net_checkout_median'] = df_customer_cluster[ # cluster_i]['end area'].apply(lambda x: counter_end[x]) start_avg = df_customer_cluster[cluster_i][[ 'start area', 'start_area_net_checkout' ]].groupby(['start area']).mean() start_avg_gis = gpd.GeoDataFrame( pd.merge(start_avg.reset_index(), map[['boro', 'area', 'geometry']], how='inner', left_on='start area', right_on='area'), geometry='geometry', crs={ 'init': 'epsg:4326', 'no_defs': True }) map.plot(ax=ax[cluster_i, 0], figsize=(8, 8), alpha=0.5, edgecolor='k') start_avg_gis.plot(column='start_area_net_checkout', ax=ax[cluster_i, 0], legend=True) end_avg = df_customer_cluster[cluster_i][[ 'end area', 'end_area_net_checkout' ]].groupby(['end area']).mean() end_avg_gis = gpd.GeoDataFrame(pd.merge( end_avg.reset_index(), map[['boro', 'area', 'geometry']], how='inner', left_on='end area', right_on='area'), geometry='geometry', crs={ 'init': 'epsg:4326', 'no_defs': True }) map.plot(ax=ax[cluster_i, 1], figsize=(8, 8), alpha=0.5, edgecolor='k') end_avg_gis.plot(column='end_area_net_checkout', 
ax=ax[cluster_i, 1], legend=True) ax[cluster_i, 0].title.set_text( f'Start Station Net Checkouts in cluster {cluster_i + 1}' ) ax[cluster_i, 1].title.set_text( f'End Station Net Checkouts in cluster {cluster_i + 1}' ) plt.savefig( os.path.join( root_path, r'plots', f'202008_{labels_str_dict[type]}_{cluster_num}_station_detail.png' )) plt.close('all') run_classification = True if run_classification: y = gmm_labels_agg[3] from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split( data_customer_std_sample, y, test_size=0.2) from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score gnb = GaussianNB() gnb.fit(x_train, y_train) y_predict = gnb.predict(x_test) y_combo = list(zip(y_test, y_predict)) accuracy_score(y_test, y_predict)
f, ax = plt.subplots(1)

# Plot polygons in light grey
gpd.plotting.plot_polygon_collection(ax, bgm['geometry'], facecolor='grey', alpha=0.25,
                                     linewidth=0.1)
gpd.plotting.plot_polygon_collection(ax, gdfpol['geometry'], facecolor=None,
                                     edgecolor='green', linewidth=0.1)
f
geoplot.polyplot(f, ax=ax)

ax = geoplot.kdeplot(gdfpts_sub, shade=True, shade_lowest=False, cmap="coolwarm",
                     clip=bgm.geometry)
geoplot.polyplot(bgm, ax=ax)

ax = geoplot.pointplot(gdfpts_sub)
geoplot.polyplot(bgm, ax=ax)

# optionally write away info
df['properties.title'].to_csv('NLD_S2-L2_st31_190626.csv', index=False)

    gplt.datasets.get_path('nyc_injurious_collisions'))

fig = plt.figure(figsize=(10, 5))
proj = projection = gcrs.AlbersEqualArea(central_latitude=40.7128, central_longitude=-74.0059)
ax1 = plt.subplot(121, projection=proj)
ax2 = plt.subplot(122, projection=proj)

gplt.polyplot(nyc_boroughs, ax=ax1, projection=proj)
gplt.pointplot(nyc_fatal_collisions, projection=proj,
               hue='BOROUGH', cmap='Set1',
               edgecolor='white', linewidth=0.5,
               scale='NUMBER OF PERSONS KILLED', limits=(2, 8),
               legend=True, legend_var='scale',
               legend_kwargs={'loc': 'upper left'},
               legend_values=[2, 1], legend_labels=['2 Fatalities', '1 Fatality'],
               ax=ax1)
ax1.set_title("Fatal Crashes in New York City, 2016")

gplt.polyplot(nyc_boroughs, ax=ax2, projection=proj)
gplt.pointplot(nyc_injurious_collisions, projection=proj,
               hue='BOROUGH', cmap='Set1',
               edgecolor='white', linewidth=0.5,

import geopandas as gpd
import geoplot
import matplotlib.pyplot as plt
import contextily as ctx

df_quakes = gpd.read_file("lastday.json")
df_quakes = df_quakes[df_quakes["mag"] != "-"]
df_quakes["mag_num"] = df_quakes["mag"].astype(float)
df_quakes = df_quakes[df_quakes.mag_num > 0]

extent = (950000, 2000000, 5800000, 6300000)  # Web Mercator (EPSG:3857) coordinates
# to_crs returns a new GeoDataFrame, so the result must be assigned back
df_quakes = df_quakes.to_crs(epsg=3857)

ax = geoplot.pointplot(df_quakes, color="red", scale="mag_num", limits=(0.5, 1.5))
ax.axis(extent)
ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerLite, zoom=6)
plt.show()
# source=ctx.providers.BasemapAT.grau

""" KDEPlot of Boston AirBnB Locations ================================== This example demonstrates a combined application of ``kdeplot`` and ``pointplot`` to a dataset of AirBnB locations in Boston. The result is outputted to a webmap using the nifty ``mplleaflet`` library. We sample just 1000 points, which captures the overall trend without overwhelming the renderer. `Click here to see this plot as an interactive webmap. <http://bl.ocks.org/ResidentMario/868ac097d671df1ed5ec83eed048560c>`_ """ import geopandas as gpd import geoplot as gplt import geoplot.crs as gcrs import matplotlib.pyplot as plt import mplleaflet boston_airbnb_listings = gpd.read_file( gplt.datasets.get_path('boston_airbnb_listings')) ax = gplt.kdeplot(boston_airbnb_listings, cmap='Greens') gplt.pointplot(boston_airbnb_listings.sample(1000), color='darkgreen', ax=ax) fig = plt.gcf() plt.savefig("boston-airbnb-kde.png", bbox_inches='tight', pad_inches=0.1) # mplleaflet.show(fig)
""" Many applications involve working with geographical data. For example, when tracking global weather, we might want to plot the temperature as measured by various sensors around the world at their position on a map. For this, we can use the GeoPandas package and the Geoplot package, both of which allow us to manipulate, analyze, and visualize geographical data. This module illustrates how to use GeoPandas and Geoplot packages to load and visualize some sample geographical data. """ import geoplot import geopandas import matplotlib.pyplot as plt world = geopandas.read_file( geopandas.datasets.get_path("naturalearth_lowres")) cities = geopandas.read_file( geopandas.datasets.get_path("naturalearth_cities")) fig, ax = plt.subplots() geoplot.polyplot(world, ax=ax) geoplot.pointplot(cities, ax=ax, fc="r", marker="2") ax.axis((-180, 180, -90, 90)) plt.show()
import geopandas as gpd
from quilt.data.ResidentMario import geoplot_data

continental_cities = gpd.read_file(geoplot_data.usa_cities()).query('POP_2010 > 100000')
continental_usa = gpd.read_file(geoplot_data.contiguous_usa())

# Plot the figure.
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

poly_kwargs = {'linewidth': 0.5, 'edgecolor': 'gray', 'zorder': -1}
point_kwargs = {'linewidth': 0.5, 'edgecolor': 'black', 'alpha': 1}
legend_kwargs = {'bbox_to_anchor': (0.9, 0.9), 'frameon': False}

ax = gplt.polyplot(continental_usa,
                   projection=gcrs.AlbersEqualArea(central_longitude=-98,
                                                   central_latitude=39.5),
                   **poly_kwargs)
gplt.pointplot(continental_cities, projection=gcrs.AlbersEqualArea(), ax=ax,
               scale='POP_2010', limits=(1, 80),
               hue='POP_2010', cmap='Blues',
               legend=True, legend_var='scale',
               legend_values=[8000000, 6000000, 4000000, 2000000, 100000],
               legend_labels=['8 million', '6 million', '4 million', '2 million',
                              '100 thousand'],
               legend_kwargs=legend_kwargs,
               **point_kwargs)

plt.title("Large cities in the contiguous United States, 2010")
plt.savefig("largest-cities-usa.png", bbox_inches='tight', pad_inches=0.1)

""" KDEPlot of Boston AirBnB Locations ================================== This example demonstrates a combined application of ``kdeplot`` and ``pointplot`` to a dataset of AirBnB locations in Boston. The result is outputted to a webmap using the nifty ``mplleaflet`` library. We sample just 1000 points, which captures the overall trend without overwhelming the renderer. `Click here to see this plot as an interactive webmap. <https://bl.ocks.org/ResidentMario/868ac097d671df1ed5ec83eed048560c>`_ """ import geopandas as gpd import geoplot as gplt import geoplot.crs as gcrs import matplotlib.pyplot as plt boston_airbnb_listings = gpd.read_file( gplt.datasets.get_path('boston_airbnb_listings')) ax = gplt.kdeplot(boston_airbnb_listings, cmap='viridis', projection=gcrs.WebMercator(), figsize=(12, 12), shade=True) gplt.pointplot(boston_airbnb_listings, s=1, color='black', ax=ax) gplt.webmap(boston_airbnb_listings, ax=ax) plt.title('Boston AirBnB Locations, 2016', fontsize=18)
contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))
scheme = mc.Quantiles(continental_usa_cities['POP_2010'], k=5)

ax = gplt.polyplot(
    contiguous_usa,
    zorder=-1,
    linewidth=1,
    projection=gcrs.AlbersEqualArea(),
    edgecolor='white',
    facecolor='lightgray',
    figsize=(8, 12)
)
gplt.pointplot(
    continental_usa_cities,
    scale='POP_2010',
    limits=(2, 30),
    hue='POP_2010',
    cmap='Blues',
    scheme=scheme,
    legend=True,
    legend_var='scale',
    legend_values=[8000000, 2000000, 1000000, 100000],
    legend_labels=['8 million', '2 million', '1 million', '100 thousand'],
    legend_kwargs={'frameon': False, 'loc': 'lower right'},
    ax=ax
)
plt.title("Large cities in the contiguous United States, 2010")
plt.savefig("largest-cities-usa.png", bbox_inches='tight', pad_inches=0.1)
plt.show()

""" This script tests whether the current environment works correctly or not. """ import sys sys.path.insert(0, '../') import geoplot as gplt from geoplot import crs as gcrs import geopandas as gpd # cf. https://github.com/Toblerity/Shapely/issues/435 # Fiona/Shapely/Geopandas test. cities = gpd.read_file("../data/cities/citiesx010g.shp") census_tracts = gpd.read_file( "../data/nyc_census_tracts/census_tracts_2010.geojson") # Cartopy test. gplt.pointplot(cities.head(50), extent=(10, 20, 10, 20))
injurious_collisions = gpd.read_file(geoplot_data.nyc_injurious_collisions())

# Plot the data.
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 5))

ax1 = plt.subplot(121, projection=gcrs.AlbersEqualArea(central_latitude=40.7128,
                                                       central_longitude=-74.0059))
gplt.polyplot(boroughs, ax=ax1, projection=gcrs.AlbersEqualArea())
gplt.pointplot(fatal_collisions, projection=gcrs.AlbersEqualArea(),
               hue='BOROUGH', categorical=True,
               edgecolor='white', linewidth=0.5, zorder=10,
               scale='NUMBER OF PERSONS KILLED', limits=(2, 8),
               legend=True, legend_var='scale',
               legend_kwargs={'loc': 'upper left'},
               legend_values=[2, 1], legend_labels=['2 Fatalities', '1 Fatality'],
               ax=ax1)
plt.title("Fatal Crashes in New York City, 2016")

ax2 = plt.subplot(122, projection=gcrs.AlbersEqualArea(central_latitude=40.7128,
                                                       central_longitude=-74.0059))
gplt.polyplot(boroughs, ax=ax2, projection=gcrs.AlbersEqualArea())
gplt.pointplot(injurious_collisions, projection=gcrs.AlbersEqualArea(),
               hue='BOROUGH', categorical=True,
               edgecolor='white', linewidth=0.5, zorder=10,
               scale='NUMBER OF PERSONS INJURED', limits=(1, 10),
               legend=True, legend_var='scale',
               legend_kwargs={'loc': 'upper left'},
               legend_values=[20, 15, 10, 5, 1],
               legend_labels=['20 Injuries', '15 Injuries', '10 Injuries',
                              '5 Injuries', '1 Injury'],
               ax=ax2)

sep=",", header=0) HUC8 = geopandas.GeoDataFrame.from_file('watershed/huc8sum.shp') HUC8.crs = {'init': 'epsg:4326'} #Ecoregions Ecoregions = geopandas.GeoDataFrame.from_file( 'NARS_NP_values/narswsa_20110504.shp') Ecoregions.crs = {'init': 'epsg:4326'} #NARS points proj = gcrs.AlbersEqualArea() ax = gplt.polyplot(HUC8, projection=proj) gplt.pointplot(gdf_NARS_site_info_UIDindex, ax=ax, projection=proj, s=1, color='red') plt.savefig('NARS_locations.pdf') plt.savefig("NARS_locations.png", bbox_inches='tight', pad_inches=0.1) #gdf_NARS_site_info_UIDindex_renamed = gdf_NARS_site_info_UIDindex.rename(columns={'HUC8':'HUC_8'}) # ============================================================================= # NARS (rivers and streams) # ============================================================================= # UID_HUC8_dict UID_HUC8_dict = { key: NARS_site_info['HUC8'][NARS_site_info.UID == key].values for key in NARS_site_info['UID'].values
y_pred = dt.predict(lakeData5)
classification = pd.DataFrame(data=y_pred, columns=['intermittent'])
classifiedData = glakeData.copy()
classifiedData['intermittent'] = classification['intermittent'].values
intermittentLakes5 = classifiedData[(classifiedData['intermittent'] == 1)]
print(intermittentLakes5.shape)

# Plot current intermittent and annual lakes
ax = gplt.polyplot(world, projection=gplt.crs.NorthPolarStereo(),
                   facecolor='whitesmoke', figsize=(15, 15))
gplt.pointplot(annualLakes, color='black', ax=ax, s=0.5, label='Annual winter ice')
gplt.pointplot(intermittentLakes, color='tab:orange', ax=ax, s=0.5,
               label='Intermittent winter ice')
lgnd = plt.legend(loc="lower left", scatterpoints=1, fontsize=18)
lgnd.legendHandles[0]._sizes = [100]
lgnd.legendHandles[1]._sizes = [100]
plt.savefig('currentLakeMapSharmaMinGood.png', bbox_inches='tight')
plt.clf()

# Plot warming scenarios
ax = gplt.polyplot(world,

               hue='avg_d_kbps',
               projection=gcrs.AlbersEqualArea(),
               cmap='Greens',
               legend=True,
               ax=ax)
plt.show()

# In[43]:

# use the location of the centroid of each polygon
data_2020['geometry'] = data_2020['geometry'].centroid

# In[56]:

ax = gplt.webmap(data_2020, projection=gcrs.WebMercator())
gplt.pointplot(data_2020, ax=ax, hue='avg_d_kbps', legend=True)
plt.show()

# In[53]:

ax = gplt.webmap(data_2020, projection=gcrs.WebMercator())
gplt.kdeplot(data_2020[['avg_d_kbps', 'geometry']], n_levels=50, cmap='Reds',
             thresh=0.05, shade=True, ax=ax)
plt.show()

# In[32]:

    'linewidth': 0
}
pointplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(),
    'scale': 'ELEV_IN_FT',
    'edgecolor': 'white',
    'linewidth': 0.5,
    'color': 'black'
}
ylim = (-1647757.3894385984, 1457718.4893930717)

# Our first plot is a default linear-scale one. We can see from the results that this is
# clearly the most appropriate one for this specific data.
gplt.polyplot(contiguous_usa.geometry, ax=axarr[0][0], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[0][0], limits=(0.1, 10),
               **pointplot_kwargs)
axarr[0][0].set_title("Linear Scale")
axarr[0][0].set_ylim(ylim)


# Next, a trivial identity scale. This results in a plot where every city has the same size.
def identity_scale(minval, maxval):
    def scalar(val):
        return 2
    return scalar


gplt.polyplot(contiguous_usa.geometry, ax=axarr[0][1], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"),

polyplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(),
    'facecolor': (0.9, 0.9, 0.9),
    'zorder': -100,
    'linewidth': 0
}
pointplot_kwargs = {
    'projection': gcrs.AlbersEqualArea(),
    'scale': 'ELEV_IN_FT',
    'edgecolor': 'white',
    'linewidth': 0.5,
    'color': 'black'
}
ylim = (-1647757.3894385984, 1457718.4893930717)

# Our first plot is a default linear-scale one. We can see from the results that this is
# clearly the most appropriate one for this specific data.
gplt.polyplot(gpd.GeoSeries(continental_usa), ax=axarr[0][0], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[0][0], limits=(0.1, 10),
               **pointplot_kwargs)
axarr[0][0].set_title("Linear Scale")
axarr[0][0].set_ylim(ylim)


# Next, a trivial identity scale. This results in a plot where every city has the same size.
def identity_scale(minval, maxval):
    def scalar(val):
        return 2
    return scalar


gplt.polyplot(gpd.GeoSeries(continental_usa), ax=axarr[0][1], **polyplot_kwargs)
gplt.pointplot(cities.query("POP_2010 > 10000"), ax=axarr[0][1],
               scale_func=identity_scale, **pointplot_kwargs)
axarr[0][1].set_title("Identity Scale")
axarr[0][1].set_ylim(ylim)
