def dbscan_reduce_vot(df, x='x', y='y'):
    """
    Cluster verblijfsobjecten with DBSCAN and reduce each cluster to the
    point closest to its centroid. DBSCAN parameters are read from
    yml['dbscan_vot']['params'].
    """
    start_time = time.time()
    # matrix of np arrays
    coords = df[['y', 'x']].values
    db = DBSCAN(**yml['dbscan_vot']['params']).fit(coords)
    cluster_labels = db.labels_
    # -1 marks noise points; exclude it from the cluster count
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    clusters = pd.Series(
        [coords[cluster_labels == n] for n in range(num_clusters)])
    # find the point in each cluster closest to its centroid
    centermost_points = clusters.map(get_centroid)
    # unzip list of centermost points (lat, lon)
    lats, lons = zip(*centermost_points)
    rep_points = pd.DataFrame({x: lons, y: lats})
    # look up the full original row for each representative point
    rs = rep_points.apply(
        lambda row: df[(df[y] == row[y]) & (df[x] == row[x])].iloc[0], axis=1)
    rs = gpd.GeoDataFrame(rs, geometry='geometry', crs=yml['crs']['crs'])
    logger.info(
        "Clustered {:,} verblijfsobjecten down to {:,} vot_clusters, "
        "for {:.2f}% compression in {:,.2f} sec.".format(
            len(df), len(rs), 100 * (1 - float(len(rs)) / len(df)),
            time.time() - start_time))
    return rs

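# Hedged usage sketch for dbscan_reduce_vot: assumes `vot_df` is a
# GeoDataFrame of verblijfsobjecten with numeric 'x'/'y' columns and a
# 'geometry' column, and that yml['dbscan_vot']['params'] holds DBSCAN
# kwargs such as eps / min_samples. The variable name is illustrative:
#
#   vot_clusters = dbscan_reduce_vot(vot_df)
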
def get_stag_table65():
    """ Load the munged 65+ stag table and return it as a GeoDataFrame. """
    df = pd.read_csv(yml['path']['data_stag_tables'] +
                     yml['file_stag_tables']['clusters65'], dtype=str)
    df['geometry'] = df['geometry'].apply(lambda x: wkt.loads(x))
    df['pnd_geom'] = df['pnd_geom'].apply(lambda x: wkt.loads(x))
    df = gpd.GeoDataFrame(df, geometry='geometry', crs=yml['crs']['crs'])
    logger.info("GeoDataFrame has shape: {} and crs: {}".format(
        df.shape, df.crs))
    return df

def create_distance_matrix_afval(df1, df2, fractie, buffer,
                                 include_nearest_point=None, n=None):
    """
    Calculate distance matrix frames. See ../helper_functions/distance_matrix.py
    Make sure you feed GeoPandas dataframes with a geometry column.

    args:
        df1: dataframe one containing geometry column (points)
        df2: dataframe with clustered afvalcontainers containing geometry
             column (points)
        fractie: waste fraction in df2 to build the matrix for
        buffer: buffer in meters around the geometry column in df2
        include_nearest_point: find the nearest point and return the
            corresponding value from the specified column. Caution: very
            slow on big sets.
        n: number of iterations. Set n=len(df2) to loop through the full set.
    """
    df2 = df2[df2['fractie'] == fractie].reset_index()
    df2['buffer'] = df2['geometry'].buffer(buffer)
    logger.info('Building dm (fractie: {}, buffer: {}) with {} iterations'
                .format(fractie, buffer, n))
    stag_distance = []
    for i, buf in enumerate(tqdm_notebook(list(df2['buffer'][:n]))):
        # points of df1 that fall inside the buffer of container cluster i
        sub_df = df1.loc[df1.geometry.within(buf), :]
        sub_df = sub_df.apply(calculate_distance,
                              dest_geom=df2['geometry'][i],
                              target_col='distance', axis=1)
        print('Shape sub_df {} = {}'.format(i, sub_df.shape))
        if include_nearest_point:
            indices = sub_df.apply(find_nearest_point,
                                   geom_union=df2.unary_union,
                                   df1=sub_df, df2=df2,
                                   geom1_col='geometry',
                                   src_column='container_id', axis=1)
            sub_df = pd.concat([sub_df, indices.to_frame()], axis=1)
        stag_distance.append(sub_df)
    return stag_distance

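# Hedged usage sketch for create_distance_matrix_afval: assumes `bag_df`
# holds point geometries and `containers` comes from
# get_afvalcontainers_full_df(); the fractie value is illustrative:
#
#   containers = get_afvalcontainers_full_df(column_subset=True)
#   dm_frames = create_distance_matrix_afval(bag_df, containers,
#                                            fractie='Rest', buffer=100,
#                                            include_nearest_point=True,
#                                            n=len(containers))
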
def get_distance_matrices(path, file):
    """
    Load the munged distance matrices resulting from
    distance_matrix/deduplicate_distance_matrix_..
    """
    df = pd.read_csv(path + file)
    geo_cols = ['geometry', 'geom_point']
    for col in geo_cols:
        df[col] = df[col].apply(lambda x: wkt.loads(x))
    df = df.drop('_merge', axis=1)
    df = gpd.GeoDataFrame(df, geometry='geometry', crs=yml['crs']['crs'])
    logger.info("GeoDataFrame has shape: {} and crs: {}".format(
        df.shape, df.crs))
    return df

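# Hedged usage sketch for get_distance_matrices; the file name is
# illustrative, not a key that exists in yml:
#
#   dm = get_distance_matrices(yml['path']['data_stag_tables'], 'dm_rest.csv')
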
def get_available_layers_from_wfs(url_wfs):
    """
    Get all layer names in a WFS service, log them and return them as a list.
    """
    layer_names = []
    parameters = {
        "REQUEST": "GetCapabilities",
        "SERVICE": "WFS"
    }
    getcapabilities = requests.get(url_wfs, params=parameters)
    root = ET.fromstring(getcapabilities.text)
    for neighbor in root.iter('{http://www.opengis.net/wfs/2.0}FeatureType'):
        # the second child of each FeatureType element holds the layer name
        logger.info("layername: " + neighbor[1].text)
        layer_names.append(neighbor[1].text)
    return layer_names

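# Hedged usage sketch: list the layers of the Amsterdam 'gebieden' WFS
# (url taken from the get_layer_from_wfs docstring below):
#
#   layers = get_available_layers_from_wfs(
#       'https://map.data.amsterdam.nl/maps/gebieden')
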
def get_df2(path, file, plot=False):
    """
    Load dataframe with the second poi geometries.
    Currently a choice from ah, oba.
    """
    df = pd.read_csv(path + file, dtype=str)
    logger.info("Loading {} GeoDataFrame, with shape: {} and crs: {}".format(
        file, df.shape, yml['crs']['crs']))
    if df.columns.str.contains('geom').any():
        df = df.rename(columns={'geom': 'geometry'})
    df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))
    df = gpd.GeoDataFrame(df, crs=yml['crs']['crs'], geometry='geometry')
    # join stadsdelen on poi2
    std = wfs.get_sd_layer()
    df = gpd.sjoin(df, std, how='inner', op='intersects')
    logger.info(
        "Spatial join of {} GeoDataFrame and Amsterdam district layer. "
        "Added columns: {}".format(file, std.columns.tolist()))
    # ah gets a 1000 meter buffer, every other poi2 a 2000 meter buffer
    buffer = 1000 if file == yml['file']['ah'] else 2000
    df['buffer'] = df['geometry'].buffer(buffer)
    logger.info("created {} meter buffer around {} geometry".format(
        buffer, df.geometry.geom_type[0]))
    if plot:
        fig, ax = plt.subplots(figsize=[15, 7])
        ax = std.plot(ax=ax)
        df.plot(ax=ax, color='red', alpha=.5, marker='*')
        gpd.GeoSeries(df.geometry.buffer(buffer)).plot(ax=ax, color='yellow',
                                                       alpha=.085)
    return df.reset_index(drop=True)

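# Hedged usage sketch for get_df2: assumes a yml['path'] entry pointing at
# the poi data folder (the 'data_path' key is illustrative):
#
#   ah_df = get_df2(yml['path']['data_path'], yml['file']['ah'], plot=True)
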
def get_layer_from_wfs(url_wfs, layer_name, crs, outputformat, retry_count=3):
    """
    Get a layer from a WFS service.

    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::

            https://map.data.amsterdam.nl/maps/gebieden

        2. layer_name: Title of the layer::

            f.i. stadsdeel

        3. crs: coordinate system number, excluding EPSG::

            28992, 4326

        4. outputformat: leave empty to return standard GML
           (Geography Markup Language), otherwise::

            json, geojson, txt, shapezip

    Returns:
        The layer in the specified output format.
    """
    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        "SRSNAME": "EPSG:{}".format(crs),
        "OUTPUTFORMAT": outputformat
    }
    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))
    retry = 0
    # web requests sometimes fail..
    while retry < retry_count:
        response = requests.get(url_wfs, params=parameters)
        logger.debug(response)
        if response.status_code == 400:
            # retrying will not fix a bad layer name, so stop here
            logger.info("Incorrect layer name: {}, please correct the "
                        "layer name".format(layer_name))
            break
        if response.status_code != 200:
            time.sleep(3)  # try again..
            retry += 1
        else:
            # status 200. Yeah!
            break
    if outputformat in ('geojson', 'json'):
        geojson = response.json()
        logger.info("{} features returned.".format(
            str(len(geojson["features"]))))
        return geojson
    return response

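# Hedged usage sketch, with the example values from the docstring above:
#
#   sd = get_layer_from_wfs('https://map.data.amsterdam.nl/maps/gebieden',
#                           layer_name='stadsdeel', crs=28992,
#                           outputformat='geojson')
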
def get_df1(path, bag_file, add_brp_18=None, add_brp_65=None, plot=None):
    """
    Load bag data with or w/o 18/65 additional info.

    args:
        path = path to the data folder
        bag_file = bag file. See yml['file'] for options
    """
    # bag clusters
    df = pd.read_csv(path + bag_file, dtype=str)
    logger.info("Loading {} GeoDataFrame, with shape: {} and crs: {}".format(
        bag_file, df.shape, yml['crs']['crs']))
    df['geometry'] = df['cl_geom'].apply(lambda x: wkb.loads(x, hex=True))
    df = gpd.GeoDataFrame(df, crs=yml['crs']['crs'], geometry='geometry')
    df = df.drop('cl_geom', axis=1)
    if bag_file == yml['file']['bag_full']:
        df['pnd_geom'] = df['pnd_geom'].apply(lambda x: wkb.loads(x, hex=True))
    if add_brp_18:
        if bag_file == yml['file']['bag_full']:
            vot18 = pd.read_csv(yml['path']['data_path_brp'] +
                                yml['file']['vot18'], sep=';', dtype=str)
            vot18['18'] = 18
            df = pd.merge(df, vot18[['lv_bag_vot_id', '18']],
                          left_on=['landelijk_vot_id'],
                          right_on=['lv_bag_vot_id'],
                          how='left', indicator=True)
            logger.info("Merge indicator counts:\n{}".format(
                df._merge.value_counts()))
            return df
        else:
            print('add_brp_18 not applicable to bag_clusters dataset')
    if add_brp_65:
        if bag_file == yml['file']['bag_full']:
            vot65 = pd.read_csv(yml['path']['data_path_brp'] +
                                yml['file']['vot65'], sep=';', dtype=str)
            vot65['65'] = 65
            df = pd.merge(df, vot65[['lv_bag_vot_id', '65']],
                          left_on=['landelijk_vot_id'],
                          right_on=['lv_bag_vot_id'],
                          how='left', indicator=True)
            logger.info("Merge indicator counts:\n{}".format(
                df._merge.value_counts()))
            return df
        else:
            print('add_brp_65 not applicable to bag_clusters dataset')
    if plot:
        n = 1000
        fig, ax = plt.subplots(figsize=[15, 7])
        logger.info("Plotting {} POINTS".format(n))
        df[:n].plot(ax=ax, color='blue', alpha=.5)
    return df

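# Hedged usage sketch for get_df1: assumes a yml['path'] entry pointing at
# the bag data folder (the 'data_path' key is illustrative):
#
#   bag_df = get_df1(yml['path']['data_path'], yml['file']['bag_full'],
#                    add_brp_65=True)
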
def get_afvalcontainers_full_df(column_subset=None):
    """ Load afvalcontainers from the API and return a cleaned GeoDataFrame. """
    # load geojson
    params = yml['afvalcontainers']['params']
    url = yml['afvalcontainers']['url']
    response = requests.get(url=url, params=params)
    data = response.json()
    # parse the json, give nice names
    results = []
    for item in data['features']:
        result_dict = {}
        result_dict['geometry'] = item['geometry']['coordinates']
        result_dict['active'] = item['properties']['active']
        result_dict['buurt_code'] = item['properties']['buurt_code']
        result_dict['container_type_id'] = item['properties'][
            'container_type_id']
        result_dict['container_id'] = item['properties']['id']
        result_dict['id_number'] = item['properties']['id_number']
        result_dict['operate_date'] = item['properties']['operational_date']
        result_dict['owner'] = item['properties']['owner']
        result_dict['place_date'] = item['properties']['placing_date']
        result_dict['serial_number'] = item['properties']['serial_number']
        result_dict['stadsdeel'] = item['properties']['stadsdeel']
        result_dict['address'] = item['properties']['text']
        result_dict['fractie'] = item['properties']['waste_name']
        result_dict['fractie_type'] = item['properties']['waste_type']
        results.append(result_dict)
    df = gpd.GeoDataFrame(results, crs=yml['crs']['crs_4326'])
    # filter out messy fracties / waste_types
    df = df[df['fractie'].isin(yml['afvalcontainers']['fracties'])]
    # reset the index so the flattened owner frame aligns on the merge below
    df = df.reset_index(drop=True)
    # convert geometry column to Points
    df['geometry'] = [Point(xy) for xy in df['geometry']]
    # flatten the 'owner' column, merge back on df, drop owner column
    owner = json_normalize(df['owner'])
    df = pd.merge(df, owner, left_index=True,
                  right_index=True).drop(labels=['owner'], axis=1)
    if column_subset:
        keep_cols = ['container_id', 'geometry', 'fractie']
        df = df[keep_cols]
    # to crs 28992
    df = df.to_crs(crs=yml['crs']['crs'])
    # filter an annoying outlier: keep only points inside the Amsterdam bbox
    df['x'] = df['geometry'].x
    df['y'] = df['geometry'].y
    df = df[(df.x >= 110000) & (df.x <= 135000) &
            (df.y >= 475000) & (df.y <= 494000)]
    df = df.reset_index(drop=True)
    logger.info("index has been reset")
    logger.info("Afvalcontainers_full df has shape: {} and crs: {}".format(
        df.shape, df.crs))
    return df

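# Hedged usage sketch: fetch the container set reduced to the
# container_id / geometry / fractie columns:
#
#   containers = get_afvalcontainers_full_df(column_subset=True)
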
def deduplicate_distance_matrix_general(stag_distance, df1, buffer):
    """
    Steps to munge the raw distance matrix into a clean deduplicated version.

    args:
        stag_distance: frames resulting from the create_distance_matrix_afval
            function
        df1: the same dataframe one, containing a geometry column (points),
            as fed to the create_distance_matrix_afval function
        buffer: buffer in meters used to build the distance matrix; distances
            missing after the merge are filled with buffer + 1
    """
    stag_dm = pd.concat(stag_distance, axis=0, sort=False)
    stag_dm.distance = stag_dm['distance'].astype(float)
    logger.info('Raw dm has shape: {}'.format(stag_dm.shape))
    # f is the module-level aggregation spec; it yields the
    # distance_min/max/mean columns used below
    dist_agg = stag_dm.groupby(['cluster_toewijzing']).agg(f).reset_index()
    # flatten the MultiIndex columns created by .agg
    dist_agg.columns = [
        f'{i}_{j}' if j != '' else f'{i}' for i, j in dist_agg.columns
    ]
    keep_cols = [
        'landelijk_pnd_id', 'pnd_geom', 'geometry', 'cluster_toewijzing'
    ]
    dist_agg = pd.merge(df1[keep_cols], dist_agg,
                        on=['cluster_toewijzing'], how='left', indicator=True)
    logger.info('Aggregated dm has shape: {}'.format(dist_agg.shape))
    logger.info('merge results:\n{}'.format(dist_agg._merge.value_counts()))
    distance_cols = ['distance_min', 'distance_max', 'distance_mean']
    for col in distance_cols:
        dist_agg[col] = (dist_agg[col].astype(float).multiply(1000)
                         .fillna(buffer + 1).map('{:.0f}'.format))
        dist_agg[col] = dist_agg[col].astype(int)
    logger.info('Filled values above the {} buffer boundaries with value: {}'
                .format(buffer, buffer + 1))
    # we want to plot on pnd_geom; GeoPandas only accepts 'geometry' for
    # plotting, so swap the column names
    dist_agg = dist_agg.rename(columns={
        'geometry': 'geom_point',
        'pnd_geom': 'geometry'
    })
    pnd_mean_dist = dist_agg.groupby(
        ['landelijk_pnd_id'])['distance_min'].mean().to_frame().reset_index()
    pnd_mean_dist = pnd_mean_dist.rename(
        columns={'distance_min': 'pnd_dist_mean'})
    pnd_mean_dist.pnd_dist_mean = pnd_mean_dist.pnd_dist_mean.map(
        '{:.0f}'.format)
    final = pd.merge(dist_agg, pnd_mean_dist,
                     on=['landelijk_pnd_id'], how='left')
    logger.info('Final dm has shape: {}'.format(final.shape))
    logger.info('columns dm: {}'.format(final.columns.tolist()))
    # plot histograms of the numerical distance columns
    num_cols = ['distance_min', 'distance_max', 'distance_mean']
    fig, ax = plt.subplots(len(num_cols), 1, figsize=[9, 6])
    print('histogram numerical distance columns: ')
    for i, col in enumerate(num_cols):
        final[final.distance_min <= buffer][col].dropna().hist(bins=40,
                                                               ax=ax[i])
        ax[i].set_title(col)
    plt.tight_layout()
    return final

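# Hedged end-to-end sketch tying the distance-matrix steps together; assumes
# `bag_df` carries the cluster_toewijzing and landelijk_pnd_id columns, and
# the fractie value is illustrative:
#
#   frames = create_distance_matrix_afval(bag_df, containers,
#                                         fractie='Rest', buffer=100,
#                                         n=len(containers))
#   dm = deduplicate_distance_matrix_general(frames, bag_df, buffer=100)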