def test_sjoin_values(self):
    # GH190
    self.polydf.index = [1, 3, 4, 5, 6]
    df = sjoin(self.pointdf, self.polydf, how='left')
    self.assertEquals(df.shape, (21, 8))
    df = sjoin(self.polydf, self.pointdf, how='left')
    self.assertEquals(df.shape, (12, 8))
def test_sjoin_values(self):
    # GH190
    self.polydf.index = [1, 3, 4, 5, 6]
    df = sjoin(self.pointdf, self.polydf, how='left')
    assert df.shape == (21, 8)
    df = sjoin(self.polydf, self.pointdf, how='left')
    assert df.shape == (12, 8)
def test_sjoin_op(self):
    # points within polygons
    df = sjoin(self.pointdf, self.polydf, how="left", op="within")
    assert df.shape == (21, 8)
    assert df.loc[1]['BoroName'] == 'Staten Island'

    # points contain polygons? never happens so we should have nulls
    df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
    assert df.shape == (21, 8)
    assert np.isnan(df.loc[1]['Shape_Area'])
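# A minimal standalone sketch (an assumption, not part of the test suite above)
# showing how the `op` argument selects the spatial predicate; newer GeoPandas
# releases call this parameter `predicate` instead of `op`.
import geopandas as gpd
from shapely.geometry import Point, Polygon

pts = gpd.GeoDataFrame({'pt_id': [1, 2]},
                       geometry=[Point(0.5, 0.5), Point(5, 5)], crs='epsg:4326')
polys = gpd.GeoDataFrame({'name': ['unit square']},
                         geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
                         crs='epsg:4326')

# 'within': only the point inside the square matches; the other row keeps NaN on a left join
print(gpd.sjoin(pts, polys, how='left', op='within'))
# 'contains': points never contain polygons, so every right-hand column is NaN
print(gpd.sjoin(pts, polys, how='left', op='contains'))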
def test_sjoin_right(self):
    # the inverse of left
    df = sjoin(self.pointdf, self.polydf, how="right")
    df2 = sjoin(self.polydf, self.pointdf, how="left")
    assert df.shape == (12, 8)
    assert df.shape == df2.shape
    for i, row in df.iterrows():
        assert row.geometry.type == 'MultiPolygon'
    for i, row in df2.iterrows():
        assert row.geometry.type == 'MultiPolygon'
def test_sjoin_right(self):
    # the inverse of left
    df = sjoin(self.pointdf, self.polydf, how="right")
    df2 = sjoin(self.polydf, self.pointdf, how="left")
    self.assertEquals(df.shape, (12, 8))
    self.assertEquals(df.shape, df2.shape)
    for i, row in df.iterrows():
        self.assertEquals(row.geometry.type, 'MultiPolygon')
    for i, row in df2.iterrows():
        self.assertEquals(row.geometry.type, 'MultiPolygon')
def test_sjoin_op(self):
    # points within polygons
    df = sjoin(self.pointdf, self.polydf, how="left", op="within")
    self.assertEquals(df.shape, (21, 8))
    self.assertEquals(df.ix[1]['BoroName'], 'Staten Island')

    # points contain polygons? never happens so we should have nulls
    df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
    self.assertEquals(df.shape, (21, 8))
    self.assertTrue(np.isnan(df.ix[1]['Shape_Area']))
def test_sjoin_invalid_args(self, dfs):
    index, df1, df2, expected = dfs

    with pytest.raises(ValueError, match="'left_df' should be GeoDataFrame"):
        res = sjoin(df1.geometry, df2)

    with pytest.raises(ValueError, match="'right_df' should be GeoDataFrame"):
        res = sjoin(df1, df2.geometry)
def test_no_overlapping_geometry(self):
    # Note: these tests are for correctly returning GeoDataFrame
    # when result of the join is empty
    df_inner = sjoin(self.pointdf.iloc[17:], self.polydf, how='inner')
    df_left = sjoin(self.pointdf.iloc[17:], self.polydf, how='left')
    df_right = sjoin(self.pointdf.iloc[17:], self.polydf, how='right')

    # Recent Pandas development has introduced a new way of handling merges
    # this change has altered the output when no overlapping geometries
    if str(pd.__version__) > LooseVersion('0.18.1'):
        right_idxs = pd.Series(range(0, 5), name='index_right', dtype='int64')
    else:
        right_idxs = pd.Series(name='index_right', dtype='int64')

    expected_inner_df = pd.concat(
        [self.pointdf.iloc[:0],
         pd.Series(name='index_right', dtype='int64'),
         self.polydf.drop('geometry', axis=1).iloc[:0]],
        axis=1)

    expected_inner = GeoDataFrame(
        expected_inner_df, crs={'init': 'epsg:4326', 'no_defs': True})

    expected_right_df = pd.concat(
        [self.pointdf.drop('geometry', axis=1).iloc[:0],
         pd.concat([pd.Series(name='index_left', dtype='int64'),
                    right_idxs],
                   axis=1),
         self.polydf],
        axis=1)

    expected_right = GeoDataFrame(
        expected_right_df, crs={'init': 'epsg:4326', 'no_defs': True})\
        .set_index('index_right')

    expected_left_df = pd.concat(
        [self.pointdf.iloc[17:],
         pd.Series(name='index_right', dtype='int64'),
         self.polydf.iloc[:0].drop('geometry', axis=1)],
        axis=1)

    expected_left = GeoDataFrame(
        expected_left_df, crs={'init': 'epsg:4326', 'no_defs': True})

    assert expected_inner.equals(df_inner)
    assert expected_right.equals(df_right)
    assert expected_left.equals(df_left)
def test_empty_join(self):
    # Check empty joins
    polygons = geopandas.GeoDataFrame(
        {'col2': [1, 2],
         'geometry': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
                      Polygon([(1, 0), (2, 0), (2, 1), (1, 1)])]})
    not_in = geopandas.GeoDataFrame({'col1': [1],
                                     'geometry': [Point(-0.5, 0.5)]})

    empty = sjoin(not_in, polygons, how='left', op='intersects')
    assert empty.index_right.isnull().all()

    empty = sjoin(not_in, polygons, how='right', op='intersects')
    assert empty.index_left.isnull().all()

    empty = sjoin(not_in, polygons, how='inner', op='intersects')
    assert empty.empty
def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Joins DataFrame with Taxi Zones shapefile.

    This function takes longitude values provided by `lon_var`, and latitude
    values provided by `lat_var` in DataFrame `df`, and performs a spatial
    join with the NYC taxi_zones shapefile.

    The shapefile is hard coded in, as this function makes a hard assumption
    about the latitude and longitude coordinates. It also assumes that
    latitude=0 and longitude=0 is not a datapoint that can exist in your
    dataset, which is reasonable for a dataset of New York, but bad for a
    global dataset.

    Only rows where `df.lon_var`, `df.lat_var` are reasonably near New York,
    and `df.locid_var` is set to np.nan are updated.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        DataFrame containing latitudes, longitudes, and location_id columns.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be np.nan.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows with
        valid, nonzero values are not overwritten.
    """
    import geopandas
    import traceback
    from shapely.geometry import Point

    localdf = df[[lon_var, lat_var, locid_var]].copy()
    # localdf = localdf.reset_index()
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    localdf['replace_locid'] = (localdf[locid_var].isnull()
                                & (localdf[lon_var] != 0.)
                                & (localdf[lat_var] != 0.))

    if (np.any(localdf['replace_locid'])):
        shape_df = geopandas.read_file('../shapefiles/taxi_zones.shp')
        shape_df.drop(['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"],
                      axis=1, inplace=True)
        shape_df = shape_df.to_crs({'init': 'epsg:4326'})

        try:
            local_gdf = geopandas.GeoDataFrame(
                localdf, crs={'init': 'epsg:4326'},
                geometry=[Point(xy) for xy in
                          zip(localdf[lon_var], localdf[lat_var])])

            local_gdf = geopandas.sjoin(
                local_gdf, shape_df, how='left', op='within')

            return local_gdf.LocationID.rename(locid_var)
        except ValueError as ve:
            print(ve)
            # ValueError has no stacktrace() method; print the full traceback instead
            traceback.print_exc()
            return df[locid_var]
    else:
        return df[locid_var]
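# A hypothetical usage sketch (not part of the original script): the trip
# DataFrame and its column names are illustrative assumptions, and the
# '../shapefiles/taxi_zones.shp' path must exist for the join to run.
import numpy as np
import pandas as pd

trips = pd.DataFrame({
    'pickup_longitude': [-73.985, 0.0],
    'pickup_latitude': [40.748, 0.0],
    'PULocationID': [np.nan, np.nan],
})

# Fill in pickup zone ids; rows with missing/zero coordinates keep NaN.
trips['PULocationID'] = assign_taxi_zones(
    trips, 'pickup_longitude', 'pickup_latitude', 'PULocationID')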
def test_sjoin_left(self):
    df = sjoin(self.pointdf, self.polydf, how='left')
    assert df.shape == (21, 8)
    for i, row in df.iterrows():
        assert row.geometry.type == 'Point'
    assert 'pointattr1' in df.columns
    assert 'BoroCode' in df.columns
def test_sjoin_left(self):
    df = sjoin(self.pointdf, self.polydf, how='left')
    self.assertEquals(df.shape, (21, 8))
    for i, row in df.iterrows():
        self.assertEquals(row.geometry.type, 'Point')
    self.assertTrue('pointattr1' in df.columns)
    self.assertTrue('BoroCode' in df.columns)
def test_sjoin_named_index(self, how):
    # original index names should be unchanged
    pointdf2 = self.pointdf.copy()
    pointdf2.index.name = 'pointid'
    df = sjoin(pointdf2, self.polydf, how=how)
    assert pointdf2.index.name == 'pointid'
    assert self.polydf.index.name == None
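# A standalone sketch (an assumption, not from the test suite): sjoin leaves
# the inputs' own index names untouched and exposes the right frame's index as
# an 'index_right' column (newer releases may use the index's own name if set).
import geopandas as gpd
from shapely.geometry import Point, Polygon

pts = gpd.GeoDataFrame({'val': [1]}, geometry=[Point(0.5, 0.5)])
pts.index.name = 'pointid'
polys = gpd.GeoDataFrame({'zone': ['a']},
                         geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])])

joined = gpd.sjoin(pts, polys, how='left')
print(pts.index.name)   # still 'pointid'
print(joined.columns)   # right index shows up as 'index_right'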
def test_sjoin_inner(self):
    # GH637
    countries = self.world[["geometry", "name"]]
    countries = countries.rename(columns={"name": "country"})
    cities_with_country = sjoin(self.cities, countries, how="inner",
                                op="intersects")
    assert cities_with_country.shape == (172, 4)
def test_geometry_name(self):
    # test sjoin is working with other geometry name
    polydf_original_geom_name = self.polydf.geometry.name
    self.polydf = (self.polydf.rename(columns={'geometry': 'new_geom'})
                   .set_geometry('new_geom'))
    assert polydf_original_geom_name != self.polydf.geometry.name
    res = sjoin(self.polydf, self.pointdf, how="left")
    assert self.polydf.geometry.name == res.geometry.name
def test_geometry_name(self):
    # test sjoin is working with other geometry name
    polydf_original_geom_name = self.polydf.geometry.name
    self.polydf = (self.polydf.rename(columns={'geometry': 'new_geom'})
                   .set_geometry('new_geom'))
    self.assertNotEqual(polydf_original_geom_name, self.polydf.geometry.name)
    res = sjoin(self.polydf, self.pointdf, how="left")
    self.assertEqual(self.polydf.geometry.name, res.geometry.name)
def test_no_overlapping_geometry(self):
    # Note: these tests are for correctly returning GeoDataFrame
    # when result of the join is empty
    df_inner = sjoin(self.pointdf.iloc[17:], self.polydf, how='inner')
    df_left = sjoin(self.pointdf.iloc[17:], self.polydf, how='left')
    df_right = sjoin(self.pointdf.iloc[17:], self.polydf, how='right')

    expected_inner_df = pd.concat(
        [self.pointdf.iloc[:0],
         pd.Series(name='index_right', dtype='int64'),
         self.polydf.drop('geometry', axis=1).iloc[:0]],
        axis=1)

    expected_inner = GeoDataFrame(
        expected_inner_df, crs={'init': 'epsg:4326', 'no_defs': True})

    expected_right_df = pd.concat(
        [self.pointdf.drop('geometry', axis=1).iloc[:0],
         pd.concat([pd.Series(name='index_left', dtype='int64'),
                    pd.Series(name='index_right', dtype='int64')],
                   axis=1),
         self.polydf],
        axis=1)

    expected_right = GeoDataFrame(
        expected_right_df, crs={'init': 'epsg:4326', 'no_defs': True})\
        .set_index('index_right')

    expected_left_df = pd.concat(
        [self.pointdf.iloc[17:],
         pd.Series(name='index_right', dtype='int64'),
         self.polydf.iloc[:0].drop('geometry', axis=1)],
        axis=1)

    expected_left = GeoDataFrame(
        expected_left_df, crs={'init': 'epsg:4326', 'no_defs': True})

    assert expected_inner.equals(df_inner)
    assert expected_right.equals(df_right)
    assert expected_left.equals(df_left)
def test_inner(self, op, dfs):
    index, df1, df2, expected = dfs

    res = sjoin(df1, df2, how='inner', op=op)

    exp = expected[op].dropna().copy()
    exp = exp.drop('geometry_y', axis=1).rename(
        columns={'geometry_x': 'geometry'})
    exp[['df1', 'df2']] = exp[['df1', 'df2']].astype('int64')
    if index == 'default-index':
        exp[['index_left', 'index_right']] = \
            exp[['index_left', 'index_right']].astype('int64')
    exp = exp.set_index('index_left')
    exp.index.name = None

    assert_frame_equal(res, exp)
def test_right(self, op, dfs):
    index, df1, df2, expected = dfs

    res = sjoin(df1, df2, how='right', op=op)

    exp = expected[op].dropna(subset=['index_right']).copy()
    exp = exp.drop('geometry_x', axis=1).rename(
        columns={'geometry_y': 'geometry'})
    exp['df2'] = exp['df2'].astype('int64')
    if index == 'default-index':
        exp['index_right'] = exp['index_right'].astype('int64')
        res['index_left'] = res['index_left'].astype(float)
    exp = exp.set_index('index_right')
    exp = exp.reindex(columns=res.columns)

    assert_frame_equal(res, exp, check_index_type=False)
def test_left(self, op, dfs):
    index, df1, df2, expected = dfs

    res = sjoin(df1, df2, how='left', op=op)

    exp = expected[op].dropna(subset=['index_left']).copy()
    exp = exp.drop('geometry_y', axis=1).rename(
        columns={'geometry_x': 'geometry'})
    exp['df1'] = exp['df1'].astype('int64')
    if index == 'default-index':
        exp['index_left'] = exp['index_left'].astype('int64')
        # TODO: in result the dtype is object
        res['index_right'] = res['index_right'].astype(float)
    exp = exp.set_index('index_left')
    exp.index.name = None

    assert_frame_equal(res, exp)
def test_crs_mismatch(self, dfs):
    index, df1, df2, expected = dfs
    df1.crs = {'init': 'epsg:4326', 'no_defs': True}
    with pytest.warns(UserWarning):
        sjoin(df1, df2)
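# A standalone sketch (an assumption, not from the test suite): in the
# GeoPandas versions these tests target, sjoin emits a UserWarning when the
# two frames carry different CRS definitions; it does not reproject for you.
import warnings
import geopandas as gpd
from shapely.geometry import Point, Polygon

left = gpd.GeoDataFrame(geometry=[Point(0.5, 0.5)], crs='epsg:4326')
right = gpd.GeoDataFrame(geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
                         crs='epsg:3857')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    gpd.sjoin(left, right)
print([str(w.message) for w in caught])

# Align the CRS explicitly before joining to avoid the warning:
joined = gpd.sjoin(left, right.to_crs(left.crs))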
def test_sjoin_outer(self):
    df = sjoin(self.pointdf, self.polydf, how="outer")
    assert df.shape == (21, 8)
def city_directions():
    """
    Purpose: To create a path between any two USA cities using the NA roads dataset
    Input:   None
    Output:  The path distance in miles OR -1 if no path exists
    """
    # data looks like this: [ "cityname1", "cityname2" ]
    citySource, cityDest = request.args.get("cityArgs", None).split(',')
    citySource = (unquote(citySource)).title()
    cityDest = (unquote(cityDest)).title()

    source_target_list = []

    # only run if both source and destination cities are populated with text
    if citySource and cityDest:
        source_target_list = load_city_docs(citySource, cityDest)

    # identify the source city
    source_city = source_target_list[0]
    source_city_coords = tuple(source_city['geometry']['coordinates'])
    source_city_name = source_city['properties']['name']

    # locate the nearest road segment(s) to source city
    nearest_roads_to_source = [
        list(linestring.coords) for linestring in
        list(usrails_and_roads_DF.iloc[list(
            usrail_and_roads_SI.nearest((source_city_coords[0],
                                         source_city_coords[1],
                                         source_city_coords[0],
                                         source_city_coords[1]),
                                        num_results=1))].geometry)
    ]

    target_city = source_target_list[1]
    target_city_coords = tuple(target_city['geometry']['coordinates'])
    target_city_name = target_city['properties']['name']

    # find the closest road segment(s) to the target city
    nearest_roads_to_target = [
        list(linestring.coords) for linestring in
        list(usrails_and_roads_DF.iloc[list(
            usrail_and_roads_SI.nearest((target_city_coords[0],
                                         target_city_coords[1],
                                         target_city_coords[0],
                                         target_city_coords[1]),
                                        num_results=1))].geometry)
    ]

    # adds the source city to the NA roads graph
    if source_city_coords not in US_road_graph:
        add_city_to_graph(source_city_coords, source_city_name,
                          nearest_roads_to_source, US_road_graph)

    # adds the target city to the NA roads graph
    if target_city_coords not in US_road_graph:
        add_city_to_graph(target_city_coords, target_city_name,
                          nearest_roads_to_target, US_road_graph)

    # if a path exists between the two cities, create a geojson feature of it
    if has_path(US_road_graph, source_city_coords, target_city_coords):
        total_distance, path = single_source_dijkstra(US_road_graph,
                                                      source_city_coords,
                                                      target_city_coords,
                                                      weight=distance)
        path_geojson = {
            'type': 'feature',
            'properties': {
                'source': source_city_name,
                'destination': target_city_name,
                'distance': total_distance
            },
            'geometry': {
                'type': 'LineString',
                'coordinates': [list(point) for point in path]
            }
        }

        # load the path into a GeoDataFrame for processing
        path_df = GeoDataFrame.from_features([path_geojson], crs="epsg:4326")

        # use the buffer method to produce a Polygon of 0.2 degrees thickness surrounding the path
        buffered_path = (path_df.buffer(0.2)).to_crs(crs="epsg:4326")

        # create a dataframe from the buffered path
        buffered_path_df = GeoDataFrame(buffered_path,
                                        geometry=buffered_path.geometry)
        buffered_path_df[0] = None

        # perform a spatial join of the buffered path and the ufo sightings,
        # earthquakes, etc. dataframe.
        # This will return all disasters within 0.2 degrees of the path
        join_results = GeoDataFrame(
            sjoin(disasters_DF, buffered_path_df, lsuffix="left"))

        # from here, dump the path, the buffered path, and the disasters
        # 0.2 degrees from the path to files for the front end to visualize
        dump(
            path_geojson,
            open(
                './Assignments/A05/assets/api/data/shortest_paths/' +
                source_city_name + '_' + target_city_name + '.geojson', 'w'))
        dump(
            loads(buffered_path.to_json()),
            open(
                './Assignments/A05/assets/api/data/shortest_paths/buffered.geojson',
                'w'))
        dump(
            loads(join_results.to_json(show_bbox=False)),
            open(
                './Assignments/A05/assets/api/data/shortest_paths/closest_points.geojson',
                'w'))

        return str(total_distance)
    else:
        return "-1"
def test_sjoin_bad_op(self):
    # AttributeError: 'Point' object has no attribute 'spandex'
    with pytest.raises(ValueError):
        sjoin(self.pointdf, self.polydf, how="left", op="spandex")
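# A standalone sketch (an assumption, not from the test suite): with the
# op/predicate keyword used throughout these tests, an unsupported predicate
# name raises ValueError before any geometry method is looked up.
import geopandas as gpd
from shapely.geometry import Point, Polygon

pts = gpd.GeoDataFrame(geometry=[Point(0.5, 0.5)])
polys = gpd.GeoDataFrame(geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])])

try:
    gpd.sjoin(pts, polys, how='left', op='spandex')
except ValueError as err:
    print(err)  # the message lists the supported predicates (intersects, within, contains, ...)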
def flows_catchs():
    ## Identify catchments and flowlines to HUCs

    ## Find the HUC12s that intersect with the input polygon
    #shapefile = 'data/TX-Counties/Young/TX-County-Young.shp'
    shape = gpd.read_file(args.input)
    if args.huc12:
        #huc12 = 'data/WBD_National_GDB/WBD_National_GDB.shp/WBDHU12.shp'
        ## TODO: Extend to any HUC input
        hucs = gpd.read_file(args.huc12, mask=shape)
        ## TODO: Make separate flowline and catchment input optional
        if args.nhd:
            hucs = hucs[['HUC12', 'geometry']]
    elif args.hydrobasins:
        hucs = gpd.read_file(args.hydrobasins, mask=shape)
        ## TODO: Make separate flowline and catchment input optional
        if args.hydrosheds_basins:
            hucs = hucs[['HYBAS_ID', 'geometry']]
        ## TODO: Make separate flowline and catchment input optional
        #elif args.hydrobasins_basins:
    else:
        raise (ValueError("Missing basin data"))

    #nhd = 'data/NFIEGeo_12.gdb'
    ## Find the flowlines whose representative points are in these HUC12s
    if args.nhd:
        flows = gpd.read_file(args.nhd, layer='Flowline', mask=hucs)
    elif args.hydrosheds_rivers:
        flows = gpd.read_file(args.hydrosheds_rivers, mask=hucs)
        ## TODO: Make separate flowline and catchment input optional
    else:
        raise (ValueError("Missing flowline data"))
    flows.drop(columns=[
        'Shape_Length', 'Shape_Area', 'AreaSqKM', 'index_left', 'index_right'
    ], inplace=True, errors='ignore')
    flows.reset_index(inplace=True)
    flows.set_index('COMID', inplace=True)
    flows.sort_index(inplace=True)
    flows_rep = flows.copy()
    flows_rep['geometry'] = flows.representative_point()
    if flows_rep.crs != hucs.crs:
        flows_rep = gpd.sjoin(flows_rep, hucs.to_crs(flows_rep.crs),
                              op='intersects', how='inner')
    else:
        flows_rep = gpd.sjoin(flows_rep, hucs, op='intersects', how='inner')
    flows_rep.drop(columns=['index_left', 'index_right'], inplace=True,
                   errors='ignore')

    ## Find the catchments corresponding with these flowlines
    catchs = gpd.read_file(args.nhd, layer='Catchment')
    catchs.drop(columns=['index_left', 'index_right'], inplace=True,
                errors='ignore')
    catchs.reset_index(inplace=True)
    catchs.set_index('FEATUREID', inplace=True)
    catchs.sort_index(inplace=True)
    catchs = catchs[catchs.index.isin(flows_rep.index)]

    ## Find the flowlines corresponding with these catchments
    ## (Note: this line is optional.
    ##  Commenting it out will result in non-COMID-identified flowlines)
    flows = flows[flows.index.isin(catchs.index)]

    ## Determine which HUC12s each of the flowlines and catchments belong to
    flows.loc[flows.index, 'HUC12'] = flows_rep.loc[flows.index, 'HUC12']
    catchs.loc[catchs.index, 'HUC12'] = flows.loc[catchs.index, 'HUC12']

    flows.loc[flows['StreamOrde'] == 0, 'Roughness'] = .99
    flows.loc[flows['StreamOrde'] == 1, 'Roughness'] = .2
    flows.loc[flows['StreamOrde'] == 2, 'Roughness'] = .1
    flows.loc[flows['StreamOrde'] == 3, 'Roughness'] = .065
    flows.loc[flows['StreamOrde'] == 4, 'Roughness'] = .045
    flows.loc[flows['StreamOrde'] == 5, 'Roughness'] = .03
    flows.loc[flows['StreamOrde'] == 6, 'Roughness'] = .01
    flows.loc[flows['StreamOrde'] == 7, 'Roughness'] = .025

    flows = flows[flows.is_valid]
    catchs = catchs[catchs.is_valid]

    catchs = catchs[catchs.index.isin(flows.index)]
    flows = flows[flows.index.isin(catchs.index)]

    return (flows, catchs)
def test_sjoin_duplicate_column_name(self):
    pointdf2 = self.pointdf.rename(columns={'pointattr1': 'Shape_Area'})
    df = sjoin(pointdf2, self.polydf, how="left")
    self.assertTrue('Shape_Area_left' in df.columns)
    self.assertTrue('Shape_Area_right' in df.columns)
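# A small standalone sketch (an assumption, not from the test suite): when both
# frames share a non-geometry column name, sjoin disambiguates it with
# suffixes, which can be overridden via lsuffix/rsuffix.
import geopandas as gpd
from shapely.geometry import Point, Polygon

pts = gpd.GeoDataFrame({'area': [0.0]}, geometry=[Point(0.5, 0.5)])
polys = gpd.GeoDataFrame({'area': [1.0]},
                         geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])])

joined = gpd.sjoin(pts, polys, how='left')
print(joined.columns)  # expect 'area_left' and 'area_right'

joined = gpd.sjoin(pts, polys, how='left', lsuffix='pt', rsuffix='poly')
print(joined.columns)  # expect 'area_pt' and 'area_poly'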
import numpy as np
import pandas as pd
import os
import geopandas as gpd

# This script appends information on the ADM1 region that contains each
# Cities4Forests city and saves it to a new file.

# Change to directory
working_dir = '/Users/kristine/WRI/Cities4Forests/Defra_Watersheds'
os.chdir(working_dir)

# Load city locations
city_locations_file = '/Users/kristine/WRI/Cities4Forests/Defra_Watersheds/Cities4Forests Watersheds/Cities4Forests Watersheds.shp'
city_locations = gpd.read_file(city_locations_file)

# Load adm1 polygons
adm1_file = '/Users/kristine/WRI/gadm36_levels_shp/gadm36_1.shp'
adm1 = gpd.read_file(adm1_file)

# Spatially join files, which preserves the geometry of the first dataset,
# while adding the attributes of the right dataset
merged_files = gpd.sjoin(city_locations,
                         adm1[['NAME_0', 'NAME_1', 'geometry']],
                         how='left', op='intersects')

# Select desired columns
merged_files = merged_files[['City', 'City Simpl', 'Country', 'Watersheds',
                             'Latitude', 'Longitude', 'Tree Cover',
                             'Biomass Lo', 'Restoratio', 'Carbon Seq',
                             'Aqueduct W', 'Overlaps w', 'Watershed',
                             'NAME_0', 'NAME_1']]

# Save to a CSV file
merged_files.to_csv('Cities4Forests Watersheds with ADM1.csv', index=False)
                Polygon([(XleftOrigin, Ytop), (XrightOrigin, Ytop),
                         (XrightOrigin, Ybottom), (XleftOrigin, Ybottom)]))
        Ytop = Ytop - height
        Ybottom = Ybottom - height
    XleftOrigin = XleftOrigin + width
    XrightOrigin = XrightOrigin + width

# Create a polygon-type GeoDataFrame
grid = gpd.GeoDataFrame({'geometry': polygons})

# Declare a projection system (4326 = WGS 84)
grid.set_crs(epsg=4326, inplace=True)

# This coefficient will be used to create bars visible at a large scale on the map
# EDITABLE
coef_exageration = 500

# Spatial join between the points and the cells of the grid we created.
# In other words: each point is associated with the cell it falls in; thus,
# for each point we create an additional cell
dfsjoin = gpd.sjoin(grid, points)  # Spatial join Points to polygons

# Give an id to each matched cell
dfsjoin["id"] = dfsjoin.index + 1

# Dissolve the cells by id and count the merged cells, which gives the number
# of points in each cell.
dataFinal = dfsjoin.dissolve(by='id', aggfunc='count')

# Add a height variable based on the count field (used for extruding the
# polygons on the map)
#dataFinal['height'] = dataFinal['name']*coef_exageration

# Rename the 'name' field to 'value'. The 'name' field holds the point count
# per cell.
dataFinal = dataFinal.rename(columns={'name': 'value'})

# TO MODIFY
dataFinal.to_file("./departements-france-2020-12-21.geojson", driver="GeoJSON")
empty_grid.reset_index(drop=True, inplace=True)
grid = empty_grid

###
# adm
#coords_geom = [Point(xy) for xy in zip(grid.lon, grid.lat)]
#coords_df = gpd.GeoDataFrame(grid['cell_id'], crs='epsg:4326', geometry=coords_geom)
#del coords_geom

adm = gpd.read_file(path + "/gadm36_AFG_2.geojson")
adm = adm[["GID_1", "NAME_1", "GID_2", "NAME_2", "geometry"]]

grid = gpd.sjoin(grid, adm, how="left")
grid = grid[~grid.index.duplicated(keep="first")]
grid.drop(["index_right"], axis=1, inplace=True)

#temp = temp[["cell_id", "GID_1", "NAME_1", "GID_2", "NAME_2"]]
#idx = temp["cell_id"].duplicated()*1
#temp = temp.loc[idx==0, : ]
#grid = grid.merge(temp, how="left", on="cell_id")
#grid.drop(grid[grid.NAME_2.isnull()].index, inplace=True)
#grid.reset_index(drop=True, inplace=True)
def osm_for_region(i, ret_df, key, tags, dist, area):
    _addresses = addresses[addresses.Reg_Code == i]
    print("Grabbing data in region {} of 5".format(i + 1))
    pbar = '--------------------------------------------------'
    prog = '| {0:.0%}'.format(0)

    # 2. Load building data for the region
    if len(_addresses) > 0:
        # merge all building data from the South/West
        if key == 'building':
            if i == 3:
                path = USA[key][i]
                gdf = gdf_from_json(path)
                gdf1 = gdf_from_json(USA[key][4])
                gdf2 = gdf_from_json(USA[key][5])
                gdf3 = gdf_from_json(USA[key][6])
                gdf = gdf.append([gdf1, gdf2, gdf3], ignore_index=True)
            elif i == 4:
                path = USA[key][7]
                gdf = gdf_from_json(path)
                gdf1 = gdf_from_json(USA[key][8])
                gdf2 = gdf_from_json(USA[key][9])
                gdf3 = gdf_from_json(USA[key][10])
                gdf = gdf.append([gdf1, gdf2, gdf3], ignore_index=True)
            else:
                path = USA[key][i]
                gdf = gdf_from_json(path)
        else:
            path = USA[key][i]
            gdf = gdf_from_json(path)
        print(pbar + prog, end='\r')
    else:
        print("No addresses in region {}".format(i + 1))

    progress = 0
    for k in _addresses.index:
        progress += 1
        # Create new vector to append at end of iteration
        coords = _addresses['Lat_Lon'][k]
        ID = _addresses['ID'][k]
        Add = _addresses['Address'][k]
        Shi = _addresses['Shipping'][k]
        df = pd.DataFrame({
            'ID': [ID],
            'Address': [Add],
            'Lat_Lon': [coords],
            'Shipping': [Shi]
        })

        # 3. Create bounding box as GeoDataFrame
        b = fp.bbox_from_point(coords, dist)

        # 4. Filter gdf by bounding box
        if key == 'highway':
            # To get intersecting highways, we cannot use standard cx filtering
            bb = box(b[3], b[1], b[2], b[0])
            bb = Polygon(bb)
            bbox = gpd.GeoSeries([bb])
            bbox = gpd.GeoDataFrame({'geometry': bbox})
            # Filter gdf for all data intersecting Polygon bbox
            osm_in_bbox = gpd.sjoin(bbox, gdf, how='left', op='intersects')
        else:
            osm_in_bbox = gdf.cx[b[3]:b[2], b[1]:b[0]]

        for tag in tags:
            x = osm_in_bbox[osm_in_bbox[key] == tag]
            y = tag + '_' + key[0].upper()
            df[y] = [len(x)]
            if area == True:
                t = y + "_area"
                df[t] = [sum(x['geometry'].area)]

        # 6. Store as new columns in DataFrame
        ret_df = ret_df.append(df)

    return ret_df
file2 = drive.CreateFile({'id': '1tkhDiyW4eHp9IGyFmH4iO9_F1lrj701h'})
file2.GetContentFile('landkreise.shp')
file3 = drive.CreateFile({'id': '1RE5rCRsw_hRH7nZkNLYljEGbPiN63_eW'})
file3.GetContentFile('landkreise.shx')
file4 = drive.CreateFile({'id': '1vh5JNv8UvmIKWXRecHrvzWnY8qSrlbj6'})
file4.GetContentFile('landkreise.cpg')
file5 = drive.CreateFile({'id': '156d84fK3IuC_comNOnO7f4nPt9b-P1Zm'})
file5.GetContentFile('landkreise.dbf')
file6 = drive.CreateFile({'id': '1YQh84mF2qiiHX0j_fcuZa-xkWaLOtPZK'})
file6.GetContentFile('landkreise.prj')

df_kreise = gpd.read_file('landkreise.shp')
df_kreise.crs = {'init': 'epsg:4326'}

# match the two
matched_kreise = gpd.sjoin(geo_google_id_df, df_kreise, how='right')

# treatment
test_kreise = matched_kreise.groupby(
    'WARNCELLID').index_left.count().value_counts().sort_index()
treat_kreise = matched_kreise.groupby('WARNCELLID').apply(
    lambda x: x.sample(frac=0.5, random_state=1))['index_left'].to_frame()
treat_kreise['treatment'] = 1
matched_kreise = matched_kreise.merge(treat_kreise, on='index_left', how='left')
matched_kreise['treatment'] = matched_kreise.treatment.fillna(value=0)

#matched_kreise.groupby('treatment').count()
#points_kreise = matched_kreise[['Longitude','Latitude','treatment','Name']].dropna()
#geo_points_kreise = gpd.GeoDataFrame(points_kreise, geometry=gpd.points_from_xy(points_kreise.Longitude, points_kreise.Latitude))
def output_files(arguments):
#def output(flow_key,flowshu12shape,catchshu12shape,hu12catchs,avail_hu12catchs_group,args,prefix,dst_crs,mem_estimates):
    ## Output catchments, flowlines, roughnesses, and rasters

    try:

        def output_nhd(flows, catchs, hu):
            ## For each HUC, write the flowlines, catchments, and roughnesses
            ## corresponding to it

            out_path = os.path.join(subdirectory, 'Flowlines.shp')
            my_file = Path(out_path)
            #if my_file.is_file() and not arguments[1].args.overwrite and not arguments[1].args.overwrite_flowlines:
            if (my_file.is_file() and not arguments[5].overwrite
                    and not arguments[5].overwrite_flowlines):
                #if my_file.is_file() and not args.overwrite and not args.overwrite_flowlines:
                pass
            else:
                my_file.unlink(missing_ok=True)
                #flowshu12shape[flowshu12shape['HUC12']==hu].reset_index().to_file(out_path)
                flows.reset_index().to_file(out_path)

            out_path = os.path.join(subdirectory, 'Roughness.csv')
            my_file = Path(out_path)
            #if my_file.is_file() and not arguments[1].args.overwrite and not arguments[1].args.overwrite_roughnesses:
            if (my_file.is_file() and not arguments[5].overwrite
                    and not arguments[5].overwrite_roughnesses):
                #if my_file.is_file() and not args.overwrite and not args.overwrite_roughnesses:
                pass
            else:
                my_file.unlink(missing_ok=True)
                with open(out_path, 'w', newline='') as outcsv:
                    writer = csv.writer(outcsv)
                    writer.writerow(['COMID', 'StreamOrde', 'Roughness'])
                    for comid in np.sort(flows.index.unique()):
                        writer.writerow([
                            comid, flows.loc[comid, 'StreamOrde'],
                            flows.loc[comid, 'Roughness']
                        ])

            out_path = os.path.join(subdirectory, 'Catchments.shp')
            my_file = Path(out_path)
            #if my_file.is_file() and not arguments[1].args.overwrite and not arguments[1].args.overwrite_catchments:
            if (my_file.is_file() and not arguments[5].overwrite
                    and not arguments[5].overwrite_catchments):
                #if my_file.is_file() and not args.overwrite and not args.overwrite_catchments:
                pass
            else:
                my_file.unlink(missing_ok=True)
                #catchshu12shape[catchshu12shape['HUC12']==hu].reset_index().to_file(out_path)
                catchs.reset_index().to_file(out_path)

        def get_mosaic(avail_hucscatchs_group, hu, break_hu, dst_crs):
            ## Get mosaic of DEMs for each HUC

            def append_check(src_files_to_mosaic, var, subdirectory, hu):
                ## Check each raster's resolution in this HUC
                if any(np.float16(i) > 1. for i in var.res):
                    out_path = os.path.join(subdirectory, "gt1m.err")
                    Path(out_path).touch()
                    print('WARNING: >1m raster input for HUC12: ' + str(hu))
                    sys.stdout.flush()
                else:
                    src_res_min_to_mosaic.append(min(var.res))
                    src_res_max_to_mosaic.append(min(var.res))
                    src_x_to_mosaic.append(var.res[0])
                    src_y_to_mosaic.append(var.res[1])
                    src_files_to_mosaic.append(var)
                return (src_files_to_mosaic, src_res_min_to_mosaic,
                        src_res_max_to_mosaic, src_x_to_mosaic,
                        src_y_to_mosaic)

            ## Reproject the mosaic to DEM tiles pertaining to each HUC
            dem_fps = list(avail_hucscatchs_group['stampede2name'])
            src_files_to_mosaic = []
            src_res_min_to_mosaic = []
            src_res_max_to_mosaic = []
            src_x_to_mosaic = []
            src_y_to_mosaic = []
            memfile = {}
            for fp in dem_fps:
                memfile[fp] = MemoryFile()
            for fp in dem_fps:
                with rasterio.open(fp) as src:
                    transform, width, height = calculate_default_transform(
                        src.crs, dst_crs, src.width, src.height, *src.bounds)
                    out_meta = src.meta.copy()
                    out_meta.update({
                        'crs': dst_crs,
                        'transform': transform,
                        'width': width,
                        'height': height
                    })
                    ## Don't do an expensive reprojection if projection
                    ## already correct
                    ## TODO: This with statement may need to be changed
                    ## back to an equals
                    with memfile[fp].open(**out_meta) as dst:
                        if src.meta == out_meta:
                            dst.write(src.read())
                        else:
                            for i in range(1, src.count + 1):
                                reproject(source=rasterio.band(src, i),
                                          destination=rasterio.band(dst, i),
                                          src_transform=src.transform,
                                          src_crs=src.crs,
                                          dst_transform=dst.transform,
                                          dst_crs=dst.crs,
                                          resampling=Resampling.nearest)
                        src_files_to_mosaic, src_res_min_to_mosaic, src_res_max_to_mosaic, src_x_to_mosaic, src_y_to_mosaic = append_check(
                            src_files_to_mosaic, dst, subdirectory, hu)

            if len(src_files_to_mosaic) == 0:
                out_path = os.path.join(subdirectory, "allGT1m.err")
                Path(out_path).touch()
                print('WARNING: Found no <=1m raster input data for HUC12: ' +
                      str(hu))
                sys.stdout.flush()
                break_hu = True
                mosaic_tuple = ()
                return (break_hu, mosaic_tuple)
            else:
                src_files_to_mosaic = pd.DataFrame(
                    data={
                        'Files': src_files_to_mosaic,
                        'min(resolution)': src_res_min_to_mosaic,
                        'max(resolution)': src_res_max_to_mosaic
                    })
                src_files_to_mosaic.sort_values(
                    by=['min(resolution)', 'max(resolution)'], inplace=True)
                mosaic, out_trans = merge(list(src_files_to_mosaic['Files']),
                                          res=(max(src_x_to_mosaic),
                                               max(src_y_to_mosaic)))
                for src in src_files_to_mosaic['Files']:
                    src.close()
                out_meta = src.meta.copy()
                out_meta.update({
                    "driver": 'GTiff',
                    "height": mosaic.shape[1],
                    "width": mosaic.shape[2],
                    "transform": out_trans,
                    "crs": dst_crs
                })
                for keyvalue in memfile.items():
                    keyvalue[1].close()
                mosaic_tuple = (mosaic, out_meta)
                return (break_hu, mosaic_tuple)

        def output_raster(hu_buff_geom, mosaic, out_meta, path_elevation):
            ## Crop and output the mosaic to the buffered catchments of each HUC
            with MemoryFile() as memfile:
                with memfile.open(**out_meta) as dataset:
                    dataset.write(mosaic)
                with memfile.open(**out_meta) as dataset:
                    out_image, out_trans = rasterio.mask.mask(dataset,
                                                              hu_buff_geom,
                                                              crop=True)
            out_meta.update({
                "height": out_image.shape[1],
                "width": out_image.shape[2],
                "transform": out_trans
            })
            with rasterio.open(path_elevation, "w", **out_meta) as dst:
                dst.write(out_image)

        #subdirectory = os.path.join(arguments[1].args.directory, arguments[1].prefix+'-'+str(arguments[0]))
        subdirectory = os.path.join(arguments[5].directory,
                                    arguments[6] + '-' + str(arguments[0]))
        #subdirectory = os.path.join(args.directory, prefix+'-'+str(flow_key))
        Path(subdirectory).mkdir(parents=True, exist_ok=True)
        path_notime = os.path.join(subdirectory,
                                   "jobNoTimeLeftWhileProcessing.err")
        Path(path_notime).touch()
        path_gt1m = os.path.join(subdirectory, "allGT1m.err")
        file_gt1m = Path(path_gt1m)
        path_enclose = os.path.join(subdirectory,
                                    "rasterDataDoesNotEnclose.err")
        file_enclose = Path(path_enclose)
        if file_gt1m.is_file() or file_enclose.is_file():
            pass
        else:
            #output_nhd(arguments[1].flowshu12shape,arguments[1].catchshu12shape,arguments[0])
            output_nhd(arguments[1], arguments[2], arguments[0])
            #output_nhd(flowshu12shape,catchshu12shape,flow_key)
            path_elevation = os.path.join(subdirectory, 'Elevation.tif')
            file_elevation = Path(path_elevation)
            #if file_elevation.is_file() and not arguments[1].args.overwrite and not arguments[1].args.overwrite_rasters:
            if (file_elevation.is_file() and not arguments[5].overwrite
                    and not arguments[5].overwrite_rasters):
                #if file_elevation.is_file() and not args.overwrite and not args.overwrite_rasters:
                pass
            else:
                file_elevation.unlink(missing_ok=True)
                #avail_hu12catchs_group = arguments[1].avail_hu12catchs_grouped.get_group(arguments[0])
                break_hu = False
                #break_hu, mosaic_tuple = get_mosaic(avail_hu12catchs_group,arguments[0],break_hu,arguments[1].dst_crs)
                break_hu, mosaic_tuple = get_mosaic(arguments[4], arguments[0],
                                                    break_hu, arguments[7])
                #break_hu, mosaic_tuple = get_mosaic(avail_hu12catchs_group,flow_key,break_hu,dst_crs)
                if break_hu != True:
                    with rasterio.Env():
                        results = ({
                            'properties': {
                                'Elevation': v
                            },
                            'geometry': s
                        } for i, (s, v) in enumerate(
                            shapes((mosaic_tuple[0] ==
                                    mosaic_tuple[1]['nodata']).astype(np.int16),
                                   mask=mosaic_tuple[0] !=
                                   mosaic_tuple[1]['nodata'],
                                   transform=mosaic_tuple[1]['transform'])))
                        geoms = list(results)
                        raster = gpd.GeoDataFrame.from_features(
                            geoms, crs=mosaic_tuple[1]['crs'])
                    #hu_buff = arguments[1].hu12catchs.loc[[arguments[0]]].drop(columns=['index_left','index_right'],errors='ignore').to_crs(mosaic_tuple[1]['crs'])
                    hu_buff = arguments[3].to_crs(mosaic_tuple[1]['crs'])
                    #hu_buff = hu12catchs.to_crs(mosaic_tuple[1]['crs'])
                    hu_buff_geom = list(hu_buff['geometry'])
                    if len(gpd.sjoin(hu_buff, raster, op='within',
                                     how='inner').index) == 0:
                        out_path = os.path.join(subdirectory,
                                                "rasterDataDoesNotEnclose.err")
                        Path(out_path).touch()
                        print('WARNING: <=1m raster input data does not enclose HUC12: '
                              + str(arguments[0]))
                        #print('WARNING: <=1m raster input data does not enclose HUC12: '+str(flow_key))
                        sys.stdout.flush()
                    else:
                        #print('GOING IN OUTPUT RASTER\t',arguments[0])
                        #print('GOING IN OUTPUT RASTER\t',flows_key)
                        output_raster(hu_buff_geom, mosaic_tuple[0],
                                      mosaic_tuple[1], path_elevation)
                        #mem_estimates[flows_key] = 0.

        Path(path_notime).unlink()

    except OSError as e:
        Path(path_notime).unlink()
        out_path = os.path.join(subdirectory, "OS.err")
        Path(out_path).touch()
        with open(out_path, 'w') as f:
            #f.write("{}".format(e))
            f.write(str(e))
        print('[ERROR] OSError on HUC12: ' + str(arguments[0]))
        print(e)
        sys.stdout.flush()
        #if arguments[1].args.log:
        if arguments[5].log:
            #if args.log:
            logging.debug('[ERROR] OSError on HUC ' + str(arguments[0]))
            #logging.debug('HUC '+str(flow_key))
    except Exception as e:
        #if arguments[1].args.log:
        if arguments[5].log:
            #if args.log:
            logging.debug('[EXCEPTION] on HUC ' + str(arguments[0]))
            #logging.debug('HUC '+str(flow_key))
        return (ExceptionWrapper(e))
def available(hucscatchs):
    ## Identify each DEM tile file for our study area

    ## Find the DEM tiles that intersect with these buffered HUC12 catchments
    #availibility = 'data/TNRIS-LIDAR-Availability-20191213.shp/TNRIS-LIDAR-Availability-20191213.shp'
    avail = gpd.read_file(args.availability)
    avail_hucscatchs = gpd.sjoin(avail, hucscatchs.to_crs(avail.crs),
                                 op='intersects', how='inner')

    ## Construct an exact path for each DEM tile
    fnexts = ['.dem', '.img']
    for fnext in fnexts:
        avail_hucscatchs['demname'] = avail_hucscatchs['demname'].str.replace(
            fnext + '$', '')
    for dirname in avail_hucscatchs['dirname'].unique():
        stampede2names = []
        #raster = '/scratch/projects/tnris/tnris-lidardata'
        basename = os.path.join(args.lidar, dirname, 'dem') + os.sep
        for fnext in fnexts:
            avail_hucscatchs['demname'] = avail_hucscatchs[
                'demname'].str.replace(fnext + '$', '')
            stampede2names.extend(glob.glob(basename + '*' + fnext))
        direxts = set([
            os.path.splitext(os.path.basename(name))[1]
            for name in stampede2names
        ])
        ## If more than one vector image extension found in a DEM project,
        ## then figure out each file's extension individually
        ## TODO: Test this against stratmap-2013-50cm-ellis-henderson-hill-johnson-navarro
        if len(direxts) > 1:
            for demname in avail_hucscatchs.loc[
                    avail_hucscatchs['dirname'] == dirname, 'demname'].unique():
                truth_dirname = avail_hucscatchs['dirname'] == dirname
                truth_demname = avail_hucscatchs['demname'] == demname
                truth = np.logical_and(truth_dirname, truth_demname)
                for fnext in fnexts:
                    stampede2name = avail_hucscatchs.loc[
                        truth, 'demname'].apply(
                            lambda x: os.path.join(basename, x + fnext))
                    if glob.glob(stampede2name.iloc[0]):
                        break
                #else:
                avail_hucscatchs.loc[truth, 'stampede2name'] = stampede2name
        ## Else do all the files in a DEM project at once
        elif len(direxts) == 1:
            stampede2name = avail_hucscatchs.loc[
                avail_hucscatchs['dirname'] == dirname, 'demname'].apply(
                    lambda x: os.path.join(basename, x + list(direxts)[0]))
            stampede2name.drop_duplicates(inplace=True)
            p = Path(basename)
            for subp in p.rglob('*'):
                if len(stampede2name[stampede2name.str.lower() == str(
                        subp).lower()].index) > 0:
                    stampede2name.loc[stampede2name[stampede2name.str.lower(
                    ) == subp.as_posix().lower()].index[0]] = subp.as_posix()
            stampede2name = stampede2name[stampede2name.isin(
                [subp.as_posix() for subp in list(p.rglob('*'))])]
            avail_hucscatchs.loc[avail_hucscatchs['dirname'] == dirname,
                                 'stampede2name'] = stampede2name
        else:
            continue

    avail_hucscatchs.dropna(subset=['stampede2name'], inplace=True)
    avail_hucscatchs_grouped = avail_hucscatchs.groupby('index_right')

    return (avail_hucscatchs_grouped)
def travelshedwt(arrt):
    bk = bkpt.copy()
    if destination.loc[i, 'direction'] == 'in':
        url = doserver + 'otp/routers/default/isochrone?batch=true&mode=WALK,TRANSIT'
        url += '&fromPlace=' + destination.loc[i, 'latlong'] + '&toPlace=' + destination.loc[i, 'latlong']
        url += '&arriveBy=true&date=' + typicaldate + '&time=' + arrt + '&maxTransfers=' + str(maxTransfers)
        url += '&maxWalkDistance=' + str(maxWalkDistance) + '&clampInitialWait=-1' + cutoff
        headers = {'Accept': 'application/json'}
        req = requests.get(url=url, headers=headers)
        js = req.json()
        iso = gpd.GeoDataFrame.from_features(js, crs={'init': 'epsg:4326'})
        bk['T' + arrt[0:2] + arrt[3:5]] = 999
        cut = range(cutoffend, cutoffstart, -cutoffinterval)
        if (iso.loc[iso['time'] == cut[0] * 60, 'geometry'].notna()).bool():
            try:
                bkiso = gpd.sjoin(bk, iso.loc[iso['time'] == cut[0] * 60],
                                  how='left', op='within')
                bkiso = bkiso.loc[pd.notnull(bkiso['time']), 'blockid']
                bk.loc[bk['blockid'].isin(bkiso),
                       'T' + arrt[0:2] + arrt[3:5]] = cut[0] - cutoffinterval / 2
            except ValueError:
                print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                      str(cut[0]) + '-minute isochrone has no Census Block in it!')
            for k in range(0, (len(cut) - 1)):
                if (iso.loc[iso['time'] == cut[k + 1] * 60, 'geometry'].notna()).bool():
                    if len(bk.loc[bk['T' + arrt[0:2] + arrt[3:5]] ==
                                  cut[k] - cutoffinterval / 2]) != 0:
                        try:
                            bkiso = gpd.sjoin(
                                bk.loc[bk['T' + arrt[0:2] + arrt[3:5]] ==
                                       cut[k] - cutoffinterval / 2],
                                iso.loc[iso['time'] == cut[k + 1] * 60],
                                how='left', op='within')
                            bkiso = bkiso.loc[pd.notnull(bkiso['time']), 'blockid']
                            bk.loc[bk['blockid'].isin(bkiso),
                                   'T' + arrt[0:2] + arrt[3:5]] = cut[k + 1] - cutoffinterval / 2
                        except ValueError:
                            print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                                  str(cut[k + 1]) + '-minute isochrone has no Census Block in it!')
                    else:
                        print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                              str(cut[k]) + '-minute isochrone has no Census Block in it!')
                else:
                    print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                          str(cut[k + 1]) + '-minute isochrone has no geometry!')
        else:
            print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                  str(cut[0]) + '-minute isochrone has no geometry!')
        bk['T' + arrt[0:2] + arrt[3:5]] = bk['T' + arrt[0:2] + arrt[3:5]].replace(999, np.nan)
        bk = bk.drop(['lat', 'long', 'geometry'], axis=1)
        bk = bk.set_index('blockid')
        return bk
    elif destination.loc[i, 'direction'] == 'out':
        url = doserver + 'otp/routers/default/isochrone?batch=true&mode=WALK,TRANSIT'
        url += '&fromPlace=' + destination.loc[i, 'latlong']
        url += '&date=' + typicaldate + '&time=' + arrt + '&maxTransfers=' + str(maxTransfers)
        url += '&maxWalkDistance=' + str(maxWalkDistance) + '&clampInitialWait=0' + cutoff
        headers = {'Accept': 'application/json'}
        req = requests.get(url=url, headers=headers)
        js = req.json()
        iso = gpd.GeoDataFrame.from_features(js, crs={'init': 'epsg:4326'})
        bk['T' + arrt[0:2] + arrt[3:5]] = 999
        cut = range(cutoffend, cutoffstart, -cutoffinterval)
        if (iso.loc[iso['time'] == cut[0] * 60, 'geometry'].notna()).bool():
            try:
                bkiso = gpd.sjoin(bk, iso.loc[iso['time'] == cut[0] * 60],
                                  how='left', op='within')
                bkiso = bkiso.loc[pd.notnull(bkiso['time']), 'blockid']
                bk.loc[bk['blockid'].isin(bkiso),
                       'T' + arrt[0:2] + arrt[3:5]] = cut[0] - cutoffinterval / 2
            except ValueError:
                print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                      str(cut[0]) + '-minute isochrone has no Census Block in it!')
            for k in range(0, (len(cut) - 1)):
                if (iso.loc[iso['time'] == cut[k + 1] * 60, 'geometry'].notna()).bool():
                    if len(bk.loc[bk['T' + arrt[0:2] + arrt[3:5]] ==
                                  cut[k] - cutoffinterval / 2]) != 0:
                        try:
                            bkiso = gpd.sjoin(
                                bk.loc[bk['T' + arrt[0:2] + arrt[3:5]] ==
                                       cut[k] - cutoffinterval / 2],
                                iso.loc[iso['time'] == cut[k + 1] * 60],
                                how='left', op='within')
                            bkiso = bkiso.loc[pd.notnull(bkiso['time']), 'blockid']
                            bk.loc[bk['blockid'].isin(bkiso),
                                   'T' + arrt[0:2] + arrt[3:5]] = cut[k + 1] - cutoffinterval / 2
                        except ValueError:
                            print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                                  str(cut[k + 1]) + '-minute isochrone has no Census Block in it!')
                    else:
                        print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                              str(cut[k]) + '-minute isochrone has no Census Block in it!')
                else:
                    print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                          str(cut[k + 1]) + '-minute isochrone has no geometry!')
        else:
            print(destination.loc[i, 'id'] + ' ' + arrt + ' ' +
                  str(cut[0]) + '-minute isochrone has no geometry!')
        bk['T' + arrt[0:2] + arrt[3:5]] = bk['T' + arrt[0:2] + arrt[3:5]].replace(999, np.nan)
        bk = bk.drop(['lat', 'long', 'geometry'], axis=1)
        bk = bk.set_index('blockid')
        return bk
def test_sjoin_inner(self):
    df = sjoin(self.pointdf, self.polydf, how="inner")
    self.assertEquals(df.shape, (11, 8))
def go(acs_zips_csv, fac_path, shp_file, zip_to_zta_csv):
    df_acs = pd.read_csv(acs_zips_csv)
    df_acs['zip code tabulation area'] = df_acs[
        'zip code tabulation area'].apply(lambda x: '{0:0>5}'.format(x))

    race_cat = {
        'B02001_002E': 'white alone',
        'B02001_003E': 'black alone',
        'B02001_004E': 'native alone',
        'B02001_005E': 'asian alone',
        'B02001_006E': 'pacific alone',
        'B02001_007E': 'other alone',
        'B02001_008E': 'two or more',
        'B02001_009E': 'two or more some other'
    }
    inc_cat = {
        'B19001_002E': 'less10k',
        'B19001_003E': '10kto15k',
        'B19001_004E': '15kto20k',
        'B19001_005E': '20kto25k',
        'B19001_006E': '25kto30k',
        'B19001_007E': '30kto35k',
        'B19001_008E': '35kto40k',
        'B19001_009E': '40kto45k',
        'B19001_010E': '45kto50k',
        'B19001_011E': '50kto55k',
        'B19001_012E': '60kto75k',
        'B19001_013E': '75kto100k',
        'B19001_014E': '100kto125k',
        'B19001_015E': '125kto145k',
        'B19001_016E': '150kto200k',
        'B19001_017E': '200kmore'
    }
    family_cat = {
        'B11016_003E': '2 person',
        'B11016_004E': '3 person',
        'B11016_005E': '4 person',
        'B11016_006E': '5 person',
        'B11016_007E': '6 person',
        'B11016_008E': '7plusperson'
    }
    ratio_pov_cat_fam = {
        'B17026_002E': 'under_p5',
        'B17026_003E': 'p5top74',
        'B17026_004E': 'p75top99',
        'B17026_005E': '1to1p24',
        'B17026_006E': '1p25to1p49',
        'B17026_007E': '1p50to1p74',
        'B17026_008E': '1p75to1p84',
        'B17026_009E': '1p85to1p99',
        'B17026_010E': '2to2p99',
        'B17026_011E': '3to3p99',
        'B17026_012E': '4to4p99',
        'B17026_013E': '5andover'
    }
    ratio_pov_cat_peop = {
        'C17002_002E': 'under_p5',
        'C17002_003E': 'p5top99',
        'C17002_004E': '1to1p24',
        'C17002_005E': '1p25to1p49',
        'C17002_006E': '1p50to1p84',
        'C17002_007E': '1p85to1p99',
        'C17002_008E': '2andver'
    }
    pop_cat = {'B01003_001E': 'population'}
    med_inc_cat = {'B19013_001E': 'median income'}
    educ_cat = {
        'B15003_002E': 'no school',
        'B15003_003E': 'nursery',
        'B15003_004E': 'kindergarten',
        'B15003_005E': '1stgrade',
        'B15003_006E': '2ndgrade',
        'B15003_007E': '3rdgrade',
        'B15003_008E': '4thgrade',
        'B15003_009E': '5thgrade',
        'B15003_010E': '6thgrade',
        'B15003_011E': '7thgrade',
        'B15003_012E': '8thgrade',
        'B15003_013E': '9thgrade',
        'B15003_014E': '10thgrade',
        'B15003_015E': '11thgrade',
        'B15003_016E': '12thgrade',
        'B15003_017E': 'regular_hsd',
        'B15003_018E': 'ged',
        'B15003_019E': 'some college',
        'B15003_020E': 'some college no degree',
        'B15003_02E1': 'associate degree',
        'B15003_022E': 'bachelor',
        'B15003_023E': 'master',
        'B15003_023E': 'professional school',
        'B15003_024E': 'doctorate'
    }
    dcat = {
        'B02001_00': race_cat,
        'B19001_00': inc_cat,
        'B11016_00': family_cat,
        'B17026_00': ratio_pov_cat_fam,
        'C17002_00': ratio_pov_cat_peop,
        'C17002_00': ratio_pov_cat_peop
    }
    unique_cat = {'B01003_00': pop_cat, 'B19013_00': med_inc_cat}

    fac = exp.read_data(fac_path)
    geo_zcta = gpd.read_file(shp_file)
    fac = fac.astype({"ZIP_CODE": str})
    fac['ZIP_CODE'] = fac['ZIP_CODE'].str.split('-').str[0]
    pref = {
        'Race': 'B02001_00',
        'Income': 'B19001_00',
        'Family Size': 'B11016_00',
        'Income Poverty': 'B17026_00'
    }
    zip_to_zta = pd.read_excel(zip_to_zta_csv,
                               converters={
                                   'ZIP_CODE': '{:0>5}'.format,
                                   'ZCTA': '{:0>5}'.format
                               })
    drop_fac = [
        'ALAND10', 'AWATER10', 'INTPTLAT10', 'INTPTLON10', 'LATITUDE83',
        'LONGITUDE83', 'MTFCC10', 'PO_NAME', 'STATE', 'ZIP_TYPE',
        'Zip_join_type', 'index_right'
    ]

    fac_miss, fac_nonmiss = af.keep_miss_nonmiss(fac,
                                                 ['LONGITUDE83', 'LATITUDE83'])
    fac_nonmiss_geom = fac_nonmiss.apply(
        lambda x: Point([x['LONGITUDE83'], x['LATITUDE83']]), axis=1)
    geo_fac_nonmiss = gpd.GeoDataFrame(fac_nonmiss, geometry=fac_nonmiss_geom)
    geo_fac_nonmiss_zcta = gpd.sjoin(geo_fac_nonmiss, geo_zcta, how="left",
                                     op='intersects')
    geo_fac_nonmiss_acs = pd.merge(geo_fac_nonmiss_zcta,
                                   df_acs,
                                   how='inner',
                                   right_on=['zip code tabulation area'],
                                   left_on=['ZCTA5CE10'])
    fac_miss_zta = pd.merge(fac_miss, zip_to_zta, how='left', on="ZIP_CODE")
    geo_fac_miss_acs = pd.merge(fac_miss_zta,
                                df_acs,
                                how='inner',
                                right_on=['zip code tabulation area'],
                                left_on=['ZCTA'])
    entire_fac = pd.concat([geo_fac_nonmiss_acs, geo_fac_miss_acs])
    af.drop_features(entire_fac, drop_fac)
    for key, val in dcat.items():
        entire_fac = af.unite_the_perc(key, entire_fac, val)
    for _, val in unique_cat.items():
        af.change_cat(entire_fac, val)
    for _, val in unique_cat.items():
        af.change_cat(entire_fac, val)
    entire_fac.to_csv('acs_joined.csv')
    return entire_fac
def test_sjoin_outer(self):
    df = sjoin(self.pointdf, self.polydf, how="outer")
    self.assertEquals(df.shape, (21, 8))
geofences = gpd.read_file(zipfile)

## load gps points and convert to geopandas dataframe
gps_points = pd.read_csv("Nestle_March_Kodigo_GPS-Formatted-v2.csv")
gps_points = gpd.GeoDataFrame(
    gps_points,
    geometry=gpd.points_from_xy(gps_points["longitude"],
                                gps_points["latitude"]),
)

## add crs to gps_points data
gps_points.set_crs(epsg=4326, inplace=True)

### Spatial join
points_inside_geofence = gpd.sjoin(gps_points, geofences, how="inner",
                                   op="within")

# select columns and rename
points_inside_geofence = points_inside_geofence.loc[:, [
    "plateno", "name", "created_left"
]]
points_inside_geofence = points_inside_geofence.rename(
    columns={
        "name": "geofence_name",
        "created_left": "datestamp"
    })

# convert to datetime
data = points_inside_geofence.sort_values(by="datestamp")
                                         miny, maxy)
logger.debug('Where clause for query: {}'.format(where))
count = count_table(danco_lyr, where=where, table=True, noh=noh)
logger.debug('Count for table with where clause: {:,}'.format(count))

#%%
usfs_fps = gpd.GeoDataFrame()
offset = 0
while offset < count:
    logger.debug('Loading records: {:,} - {:,}'.format(offset, offset + limit))
    # Load footprints
    fps = query_footprint(danco_lyr, where=where, limit=limit, offset=offset,
                          noh=noh)
    # Intersect to find USFS footprints
    logger.debug('Identifying records on USFS land...')
    slice_usfs_fps = gpd.sjoin(fps, usfs, op='within')
    logger.debug('USFS records found: {}'.format(len(slice_usfs_fps)))
    # Merge to master dataframe
    usfs_fps = pd.concat([usfs_fps, slice_usfs_fps])
    logger.debug('Total USFS records found: {}'.format(len(usfs_fps)))
    # Increase offset
    offset += limit

usfs_fps_catids = set(usfs_fps['catalogid'])

#%%
# Remove onhand IDs
# oh = onhand_ids()
# mfp_ids = pgc_ids()
# usfs_noh = [x for x in usfs_fps_catids if x not in oh]
# usfs_nmfp = [x for x in usfs_fps_catids if x not in mfp_ids]
def main(argv):
    parser = argparse.ArgumentParser('Foursquare mapping to a spatial grid.')
    parser.add_argument('-i', '--input',
                        help='POIs file with relative coordinates.',
                        action='store', dest='input', required=True, type=str)
    parser.add_argument('-p', '--prefix', action='store', dest='prefix',
                        help='Prefix for the filename specifying the city name.',
                        required=True, type=str)
    parser.add_argument('-g', '--grid',
                        help='Input grid for the mapping. If crs is not WGS84, specify it with the param -c',
                        action='store', dest='grid', required=True, type=str)
    parser.add_argument('-c', '--crs',
                        help='Coordinate Reference System for the input grid. It is requested only if it is different from WGS84.',
                        action='store', dest='crs', default='epsg:4326',
                        type=str)
    parser.add_argument('-o', '--outputfolder',
                        help='Output folder where to save the mapped file.',
                        action='store', dest='outputfolder', required=True,
                        type=str)
    parser.add_argument('-lat', '--latitude', help='Latitude name.',
                        action='store', dest='latitude', default='latitude',
                        type=str)
    parser.add_argument('-long', '--longitude', help='Longitude name.',
                        action='store', dest='longitude', default='longitude',
                        type=str)
    parser.add_argument('-v', '--verbose', help='Level of output verbosity.',
                        action='store', dest='verbosity', default=0, type=int,
                        nargs="?")
    args = parser.parse_args()

    latitude = args.latitude
    longitude = args.longitude

    if (args.verbosity == 1):
        logger.setLevel(logging.INFO)
    elif (args.verbosity == 2):
        logger.setLevel(logging.DEBUG)

    # Load the grid
    logger.info("Load the grid")
    gdf = gpd.GeoDataFrame.from_file(args.grid)
    gdf.crs = {'init': args.crs}
    if args.crs != 'epsg:4326':
        gdf = gdf.to_crs({'init': 'epsg:4326'})

    # Load POIs
    logger.info("Load POIs")
    df = pd.DataFrame(pd.read_csv(args.input, sep=",", low_memory=False))

    # Create Point from latitude, longitude pairs and build a GeoDataFrame
    logger.info("Build geometry")
    geometry = [Point(xy) for xy in zip(df[longitude], df[latitude])]
    data = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry=geometry)
    data.to_crs(gdf.crs, inplace=True)

    # Check Geometry Validity
    ans = data.geometry.is_valid
    invalid = ans[ans == False]
    data.drop(invalid.index, axis=0, inplace=True)

    # Spatial Join with the grid to associate each entry to the related cell ('within') - LEFT
    join = gpd.sjoin(gdf[['cellID', 'geometry']], data, how='left', op='within')
    # Remove additional columns
    join.drop(['index_right', 'geometry'], axis=1, inplace=True)

    # Save output
    logger.info("Save output file")
    outputfile = os.path.abspath(os.path.join(
        args.outputfolder, args.prefix + "_mapped_foursquare_pois.csv"))
    join.to_csv(outputfile, index=False, sep='\t', float_format='%.6f')
# In[16]:

# read shape file
state_shp = gpd.read_file('./states_21basic/states.shp')

# In[17]:

# setting the same coordinate system as london ward. This ensures the points
# in the datasets align with the points in the shapefile, so that the
# locations are pinned properly on the map
USA_covid_19_point = USA_covid_19_point.to_crs(state_shp.crs)

# In[18]:

# spatial join of shapefile and acorn data
USA_covid_19_join = gpd.sjoin(USA_covid_19_point, state_shp, how="inner",
                              op='intersects')

# In[19]:

# see output
print(USA_covid_19_join.head())

# In[20]:

# group states
confirmed = pd.DataFrame(
    USA_covid_19_join[['state', 'index_right',
                       'confirmed']].groupby('index_right')['confirmed'].sum())
deaths = pd.DataFrame(
    USA_covid_19_join[['state', 'index_right',
def rasterize(tif, geojson, out, size=DEFAULT_IMG_WINDOW,
              stride=DEFAULT_STRIDE_WINDOW):
    """
    Start rasterizing a tif file.

    The tif image is chunked with a sliding window of the given size; each
    chunk is a numpy array that is not stored on disk at this time. The
    geojson labels are applied to these image chunks with the proper
    transformation for each polygon. The chunks and masked images are used as
    a training dataset for a deep learning model. The tif image can be extra
    large; the tool is able to handle it.

    Args:
        tif     - tif image path
        geojson - geojson file
        out     - output directory
        size    - size of window that user wants, default (h=512, w=512)
        stride  - stride size for sliding window, default same as window
    """
    # read tif
    rst = rasterio.open(tif)
    # read geojson
    scene_labels_gdf = gpd.read_file(geojson)

    out_tif = os.path.join(out, 'tif')    # make tif dir
    out_mask = os.path.join(out, 'mask')  # make mask dir
    os.makedirs(out_tif, exist_ok=True)
    os.makedirs(out_mask, exist_ok=True)

    total = imageUtils.total_chunks_in_image(tif, window=size, stride=stride)
    with tqdm(total=total, desc='Masking progress') as pbar:
        # get image from sliding window
        for each in imageUtils.get_image_chunks(tif, window_size=size,
                                                stride=stride):
            # for win, arr in get_image_chunks(rst, window_size=(win_sz, win_sz)):
            img_window, img_arr, index = each[0], each[1], each[2]
            pbar.n = index
            pbar.refresh()

            # 'miny', 'maxx', and 'maxy'
            # (807592.8560103609, 620885.095643373, 807611.1959577247, 620903.4357975163)
            bounds = rasterio.windows.bounds(img_window, rst.meta['transform'])
            # shapely.geometry.polygon.Polygon for full image chunk (512, 512) in tiff
            win_box = box(*bounds)
            win_box_gdf = gpd.GeoDataFrame(geometry=[win_box],
                                           crs=rst.meta['crs'])
            win_box_gdf = win_box_gdf.to_crs(CRS.from_epsg(4326))

            try:
                # get chip from geopandas via the bounding box, with crs 4326
                gdf_chip = gpd.sjoin(scene_labels_gdf, win_box_gdf,
                                     how='inner', op='intersects')
            except AttributeError as ae:
                pass

            # check if chip has data
            if not gdf_chip.empty:
                burn_val = 255
                shapes = [(geom, burn_val) for geom in gdf_chip.geometry]
                # transform
                chip_tfm = rasterio.transform.from_bounds(
                    *win_box_gdf.bounds.values[0], *size)
                label_arr = rasterio.features.rasterize(shapes, out_shape=size,
                                                        transform=chip_tfm)

                img_tif_name = os.path.join(out_tif, str(index))    # tif chip path
                img_mask_name = os.path.join(out_mask, str(index))  # mask path

                # change dimension of chip image to save as tif
                win_arr = np.moveaxis(img_arr, 0, 2)
                tiffUtils.save_as_tif(win_arr, chip_tfm=chip_tfm,
                                      name=img_tif_name)
                # save mask image as png
                imageUtils.save_as_png(label_arr, img_mask_name)
def aggregateByGrid(df, field, summary, gridSize):
    """
    Aggregates the specified field with chosen summary type and user
    defined grid size. returns aggregated grids with summary

    Parameters
    ----------
    df : geopandas dataframe
    field : string
        field to be summarized.
    summary : string
        type of summary to be sumarized. eg. min, max,sum, median
    gridSize : float
        the size of grid on same unit as geodataframe coordinates.

    Returns
    -------
    geodataframe
        Aggregated grids with summary on it
    """
    def round_down(num, divisor):
        return floor(num / divisor) * divisor

    def round_up(num, divisor):
        return ceil(num / divisor) * divisor

    # Get crs from data
    sourceCRS = df.crs
    targetCRS = "EPSG:3857"
    # Reproject to Mercator
    df = df.to_crs(targetCRS)

    # Get bounds
    xmin, ymin, xmax, ymax = df.total_bounds
    print(xmin, ymin, xmax, ymax)
    height, width = gridSize, gridSize
    top, left = round_up(ymax, height), round_down(xmin, width)
    bottom, right = round_down(ymin, height), round_up(xmax, width)

    rows = int((top - bottom) / height) + 1
    cols = int((right - left) / width) + 1

    XleftOrigin = left
    XrightOrigin = left + width
    YtopOrigin = top
    YbottomOrigin = top - height
    polygons = []
    for i in range(cols):
        Ytop = YtopOrigin
        Ybottom = YbottomOrigin
        for j in range(rows):
            polygons.append(Polygon([(XleftOrigin, Ytop), (XrightOrigin, Ytop),
                                     (XrightOrigin, Ybottom),
                                     (XleftOrigin, Ybottom)]))
            Ytop = Ytop - height
            Ybottom = Ybottom - height
        XleftOrigin = XleftOrigin + width
        XrightOrigin = XrightOrigin + width

    grid = gpd.GeoDataFrame({'geometry': polygons})
    grid.crs = df.crs

    # Assign gridid
    numGrid = len(grid)
    grid['gridId'] = list(range(numGrid))

    # Identify gridId for each point
    points_identified = gpd.sjoin(df, grid, op='within')

    # group points by gridid and calculate mean Easting,
    # store it as dataframe
    # delete if field already exists
    if field in grid.columns:
        del grid[field]
    grouped = points_identified.groupby('gridId')[field].agg(summary)
    grouped_df = pd.DataFrame(grouped)

    new_grid = grid.join(grouped_df, on='gridId').fillna(0)
    grid = new_grid.to_crs(sourceCRS)
    summarized_field = summary + "_" + field
    final_grid = grid.rename(columns={field: summarized_field})
    final_grid = final_grid[final_grid[summarized_field] > 0].sort_values(
        by=summarized_field, ascending=False)
    final_grid[summarized_field] = round(final_grid[summarized_field], 1)
    final_grid['x_centroid'], final_grid['y_centroid'] = \
        final_grid.geometry.centroid.x, final_grid.geometry.centroid.y
    return final_grid
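# A hypothetical usage sketch (the input file and the 'speed' column are
# illustrative assumptions, not from the original source): aggregateByGrid
# builds 500 m cells in EPSG:3857 and summarises the chosen numeric field.
import geopandas as gpd

tracks = gpd.read_file('gps_points.geojson')  # point GeoDataFrame with a 'speed' column
speed_grid = aggregateByGrid(tracks, field='speed', summary='mean', gridSize=500)
print(speed_grid[['gridId', 'mean_speed', 'x_centroid', 'y_centroid']].head())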
locations_df = gpd.GeoDataFrame(locations_df, geometry=geom)
locations_df = locations_df[['location_id', 'geometry']]
locations_df.crs = {'init': 'epsg:4326'}
print('    Creating locations data frame [DONE]')

# Read in the County boundaries
print('    Reading county shapefile', end="\r", flush=True)
counties_df = gpd.read_file('gz_2010_us_050_00_500k.shp')
counties_df = counties_df.to_crs(locations_df.crs)
counties_df['county_fips'] = counties_df['STATE'] + counties_df['COUNTY']
counties_df['state_fips'] = counties_df['STATE']
counties_df = counties_df[['county_fips', 'state_fips', 'geometry']]
print('    Reading county shapefile [DONE]')

print('    Performing spatial join', end="\r", flush=True)
gdf = gpd.sjoin(locations_df, counties_df, op='within')
gdf.reset_index(inplace=True)
gdf = gdf[['location_id', 'state_fips', 'county_fips']]
gdf.to_sql('location', con=db, index=False)
print('    Performing spatial join [DONE]')

locations = gdf.set_index('location_id').to_dict('index')

# Get the list of location ids we need to search for
location_ids = list(gdf.location_id)

try:
    c.execute('SELECT * FROM patent_location LIMIT 1')
except:
    c.execute('CREATE TABLE patent_location (patent_id text, location_id text)')

values_to_insert = list()
def spatioTemporalAggregation(df, field, summary, gridSize):
    """
    Aggregates the given field on an hour and weekday basis and prepares the
    data for a mosaic plot.

    Note: the spatial join requires the Rtree package (Rtree, or Rtree-linux)
    to be installed.

    # TODO: this function performs poorly.

    Parameters
    ----------
    df : geopandas dataframe
    field : string
        field to be summarized.
    summary : string
        type of summary to be computed, e.g. min, max, sum, median
    gridSize : float
        the size of the grid in the same units as the geodataframe coordinates.

    Returns
    -------
    geodataframes : one for the larger grid and one for the subgrids
        (for visualization purposes only). Aggregated grids with the summary
        on them.
    """
    def round_down(num, divisor):
        return floor(num / divisor) * divisor

    def round_up(num, divisor):
        return ceil(num / divisor) * divisor

    # Get crs from data
    sourceCRS = df.crs
    targetCRS = "epsg:3857"
    # Reproject to Web Mercator
    df = df.to_crs(targetCRS)
    # Get bounds
    xmin, ymin, xmax, ymax = df.total_bounds
    height, width = gridSize, gridSize
    top, left = round_up(ymax, height), round_down(xmin, width)
    bottom, right = round_down(ymin, height), round_up(xmax, width)

    rows = int((top - bottom) / height) + 1
    cols = int((right - left) / width) + 1

    XleftOrigin = left
    XrightOrigin = left + width
    YtopOrigin = top
    YbottomOrigin = top - height
    polygons = []
    for i in range(cols):
        Ytop = YtopOrigin
        Ybottom = YbottomOrigin
        for j in range(rows):
            polygons.append(Polygon(
                [(XleftOrigin, Ytop), (XrightOrigin, Ytop),
                 (XrightOrigin, Ybottom), (XleftOrigin, Ybottom)]))
            Ytop = Ytop - height
            Ybottom = Ybottom - height
        XleftOrigin = XleftOrigin + width
        XrightOrigin = XrightOrigin + width

    grid = gpd.GeoDataFrame({'geometry': polygons})
    grid.crs = targetCRS
    # Assign gridId
    numGrid = len(grid)
    grid['gridId'] = list(range(numGrid))

    # Identify gridId for each point
    df['hour'] = df['time'].apply(
        lambda x: datetime.datetime.strptime(
            x, '%Y-%m-%dT%H:%M:%S+00:00')).dt.hour
    df['weekday'] = df['time'].apply(
        lambda x: datetime.datetime.strptime(
            x, '%Y-%m-%dT%H:%M:%S+00:00')).dt.dayofweek
    # df['hour'] = pd.to_datetime(df['time']).dt.hour
    # df['weekday'] = pd.to_datetime(df['time']).dt.dayofweek
    points_identified = gpd.sjoin(df, grid, op='within')

    # Group points by gridId, weekday and hour and compute the chosen summary
    # of the field, storing the result as a dataframe.
    # Delete the field from the grid if it already exists.
    if field in grid.columns:
        del grid[field]

    # Aggregate by weekday, hour and grid
    grouped = points_identified.groupby(
        ['gridId', 'weekday', 'hour']).agg({field: [summary]})
    grouped = grouped.reset_index()
    grouped.columns = grouped.columns.map("_".join)
    modified_fieldname = field + "_" + summary

    # Create subgrids
    subgrid, mainGrid, rowNum, columnNum, value = [], [], [], [], []
    unikGrid = grouped['gridId_'].unique()
    print('Running; wait until you see "finished"')
    for currentGrid in unikGrid:
        dataframe = grid[grid['gridId'] == currentGrid]
        xmin, ymin, xmax, ymax = dataframe.total_bounds
        xminn, xmaxx = xmin + (xmax - xmin) * 0.05, xmax - (xmax - xmin) * 0.05
        yminn, ymaxx = ymin + (ymax - ymin) * 0.05, ymax - (ymax - ymin) * 0.05
        rowOffset = (ymaxx - yminn) / 24.0
        colOffset = (xmaxx - xminn) / 7.0
        tmp = (grouped['gridId_'] == currentGrid)
        for i in range(7):
            tmp2 = (grouped['weekday_'] == i)
            for j in range(24):
                topy, bottomy = ymaxx - j * rowOffset, ymaxx - (j + 1) * rowOffset
                leftx, rightx = xminn + i * colOffset, xminn + (i + 1) * colOffset
                subgrid.append(
                    Polygon([(leftx, topy), (rightx, topy),
                             (rightx, bottomy), (leftx, bottomy)]))
                mainGrid.append(currentGrid)
                rowNum.append(j)
                columnNum.append(i)
                if len(grouped[tmp & tmp2 & (grouped['hour_'] == j)]) != 0:
                    this_value = grouped[
                        tmp & tmp2 & (grouped['hour_'] == j)
                    ].iloc[0][modified_fieldname]
                    value.append(this_value)
                else:
                    value.append(np.nan)

    subgrid_gpd = gpd.GeoDataFrame({'geometry': subgrid})
    subgrid_gpd.crs = targetCRS
    # Reproject back to the source CRS
    subgrid_gpd = subgrid_gpd.to_crs(sourceCRS)
    subgrid_gpd['gridId'] = mainGrid
    subgrid_gpd['Weekday'] = columnNum
    subgrid_gpd['hour'] = rowNum
    subgrid_gpd['gridId'] = subgrid_gpd.apply(
        lambda x: str(x['gridId']) + "_" + str(x['Weekday']) + "_" + str(x['hour']),
        axis=1)
    subgrid_gpd[modified_fieldname] = value
    subgrid_gpd = subgrid_gpd.dropna()
    grid = grid.to_crs(sourceCRS)
    grid = grid[grid['gridId'].isin(unikGrid)]
    print('finished')
    return grid, subgrid_gpd
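# Usage sketch (not part of the original code): the input GeoDataFrame must carry
# a 'time' column formatted '%Y-%m-%dT%H:%M:%S+00:00', because the function parses
# it with strptime. The sample records and the 'count' field are assumptions.
import geopandas as gpd
from shapely.geometry import Point

_events = gpd.GeoDataFrame(
    {"time": ["2020-01-06T08:15:00+00:00", "2020-01-07T18:40:00+00:00"],
     "count": [1, 1],
     "geometry": [Point(103.85, 1.29), Point(103.86, 1.30)]},
    crs="EPSG:4326")
_grid, _subgrid = spatioTemporalAggregation(_events, field="count",
                                            summary="sum", gridSize=1000)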
def main(): logging.config.fileConfig(fname=os.path.join('config', 'log.config'), disable_existing_loggers=False) # Get the logger specified in the file f_handler = logging.FileHandler(os.path.join('logs', 'generate_fusiun.log')) f_handler.setLevel(logging.DEBUG) log = logging.getLogger(__name__) f_format = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') f_handler.setFormatter(f_format) log.addHandler(f_handler) parser = argparse.ArgumentParser() parser.add_argument( '-p', '--prob-minimum', default=0.4, dest='prob_minimum', help='Minimum probability to be considered as fire grids') parser.add_argument('-s', '--start-date', dest='start_date_str', help='Start date') parser.add_argument('-e', '--end-date', dest='end_date_str', help='End date') parser.add_argument('--ahi-hotspot-folder', dest='ahi_folder', default=os.path.join('..', 'data', 'raw', 'hotspots', 'ahi'), help='AHI hotspot folder') parser.add_argument('--viirs-hotspot-folder', dest='viirs_folder', default=os.path.join('..', 'data', 'raw', 'hotspots', 'viirs'), help='VIIRS hotspot folder') parser.add_argument('--modis-hotspot-folder', dest='modis_folder', default=os.path.join('..', 'data', 'raw', 'hotspots', 'modis'), help='MODIS hotspot folder') parser.add_argument('-o', '--output-folder', dest="out_file_path", default=os.path.join('..', 'data'), help="Specify output folder") parser.add_argument('-g', '--grid', dest="grid_shp", default=os.path.join( '..', 'references', 'shapefile', '2km_grid_ASEAN_land_clipped.shp'), help="Specify grid .shp file") parser.add_argument('-n', '--name', dest="prefix_name", default='FUSIUN_NRT_2km_', help="Prefix for output file names") args = parser.parse_args() log.debug(args) prob_minimum = args.prob_minimum with open(os.path.join('config', 'config.json'), "r") as read_file: json_config = json.load(read_file) clipping_box = json_config['parameters']['clipping_box'] sat_resolution_meter = json_config['sat_resolution_meter'] shapefile_path = json_config['shapefile']['path'] bounding_box = json_config['plotting']['bounding_box'] dpi = json_config['plotting']['dpi'] fusiun_ml_model_fpath = json_config['fusiun_ml_model']['path'] fusiun_predict_features = json_config['fusiun_ml_model'][ 'predict_features'] h8_ml_model_fpath = json_config['h8_ml_model']['path'] h8_predict_features = json_config['h8_ml_model']['predict_features'] low_prob_thres = json_config['parameters']['low_prob_thres'] med_prob_thres = json_config['parameters']['med_prob_thres'] #read in grid shapefile try: df_grid = geopandas.read_file(args.grid_shp) df_grid.crs = {'init': 'epsg:3857'} log.debug(args.grid_shp + ' loaded successfully!') except Exception as e: log.error(args.grid_shp + ' cannot be loaded !') exit() start_date = datetime.strptime(args.start_date_str, "%Y-%m-%d %H:%M") date_process = start_date end_date = datetime.strptime(args.end_date_str, "%Y-%m-%d %H:%M") geo_hs = geohotspot.GeoHotspot() log.info('Reading hotspot .txt files') while date_process <= (end_date + timedelta(days=1)): h8_files = os.path.join( args.ahi_folder, "H08_*" + date_process.strftime('%Y%m%d_') + "*_L2WLFbet_FLDK.06001_06001.csv") geo_hs.parse_jaxa_hotspot_txt(file_path=h8_files) jp1_files = os.path.join(args.viirs_folder, date_process.strftime('%Y%m%d') + "*JP1*.txt") geo_hs.parse_viirs_afedr_txt(file_path=jp1_files, sat_name='NOAA20') npp_files = os.path.join(args.viirs_folder, date_process.strftime('%Y%m%d') + "*NPP*.txt") geo_hs.parse_viirs_afedr_txt(file_path=npp_files, sat_name='NPP') modis_files = os.path.join( 
            args.modis_folder,
            "*14." + date_process.strftime('%y%j') + "*.txt")
        geo_hs.parse_modis_mod14_txt(file_path=modis_files)
        date_process = date_process + timedelta(days=1)

    # remove hotspots outside of clipping area
    geo_hs.clip_hotspot(clipping_box)

    # reject hotspots due to sun glint
    # geo_hs.reject_sunglint_hs('Himawari-8/9', max_sunglint_angle)

    geo_df = geo_hs.hs_df.copy()
    geo_df['aqua_weight'] = 0.0
    geo_df['terra_weight'] = 0.0
    geo_df['n20_weight'] = 0.0
    geo_df['npp_weight'] = 0.0
    geo_df['geo_weight'] = 0.0
    geo_df['confidence'] = geo_df['confidence'].fillna(0)
    geo_df = geo_df.astype({'geo_weight': 'float64', 'confidence': 'float64'})
    geo_df['date'] = pd.to_datetime(geo_df['date'], format="%d/%m/%Y %H:%M:%S")

    # select the period of interest
    geo_df = geo_df[(geo_df['date'] >= start_date) &
                    (geo_df['date'] <= end_date)]

    log.debug(geo_df['date'].unique())
    log.debug(geo_df[['satellite', 'date']].groupby(['satellite']).count())

    try:
        h8_ml_model = load(h8_ml_model_fpath)
        log.debug('Loaded trained H8 ML model from ' + h8_ml_model_fpath)
        log.debug(f'Model pipeline: {h8_ml_model}')
        geo_df.loc[geo_df['satellite'] == 'Himawari-8/9', 'geo_weight'] = \
            h8_ml_model.predict_proba(
                geo_df.loc[geo_df['satellite'] == 'Himawari-8/9',
                           h8_predict_features])[:, 1]
        log.info('Added in probabilities using H8 Gradient Boosting Model.')
    except Exception as e:
        log.exception(e)

    geo_df.loc[geo_df['satellite'] == 'TERRA', 'terra_weight'] = \
        geo_df.loc[geo_df['satellite'] == 'TERRA', 'confidence'] / 100.0
    geo_df.loc[geo_df['satellite'] == 'AQUA', 'aqua_weight'] = \
        geo_df.loc[geo_df['satellite'] == 'AQUA', 'confidence'] / 100.0
    geo_df.loc[geo_df['satellite'] == 'JP1_LATE', 'n20_weight'] = \
        geo_df.loc[geo_df['satellite'] == 'JP1_LATE', 'confidence'] / 100.0
    geo_df.loc[geo_df['satellite'] == 'NPP_LATE', 'npp_weight'] = \
        geo_df.loc[geo_df['satellite'] == 'NPP_LATE', 'confidence'] / 100.0

    # count number of Himawari observations (one scan every 10 minutes)
    geo_obs_count = int((end_date - start_date).seconds / 600)
    # normalize the weight for Himawari
    geo_df['geo_weight'] = geo_df['geo_weight'] / geo_obs_count

    # round to 8 decimals to save storage
    geo_df = geo_df.round(8)

    try:
        gdf = geopandas.GeoDataFrame(
            geo_df,
            geometry=geopandas.points_from_xy(geo_df.lon, geo_df.lat))
        log.debug('Created geopandas GeoDataFrame')
    except Exception as e:
        log.exception(e)

    # transform to mercator epsg 3857
    gdf.crs = {'init': 'epsg:4326'}
    gdf_merc = gdf.to_crs({'init': 'epsg:3857'})
    gdf_merc.reset_index(inplace=True, drop=True)
    gdf_merc['x'] = gdf_merc['geometry'].x
    gdf_merc['y'] = gdf_merc['geometry'].y

    for key, value in sat_resolution_meter.items():
        gdf_merc.loc[gdf_merc['satellite'] == key, 'resolution_meter'] = value

    try:
        interim_file_path = os.path.join(args.out_file_path, 'interim')
        os.makedirs(interim_file_path, exist_ok=True)
    except Exception as e:
        log.exception(e)
        log.warning(interim_file_path + ' directory cannot be created!')

    try:
        processed_file_path = os.path.join(args.out_file_path, 'processed')
        os.makedirs(processed_file_path, exist_ok=True)
    except Exception as e:
        log.exception(e)
        log.warning(processed_file_path + ' directory cannot be created!')

    try:
        hotspot_json = os.path.join(
            interim_file_path, args.prefix_name + 'hotspot_' +
            end_date.strftime('%Y%m%d') + '.geojson')
        gdf_merc.to_file(hotspot_json, driver='GeoJSON')
        log.info(hotspot_json + ' is saved successfully.')
    except Exception as e:
        log.exception(e)
        log.warning(hotspot_json + ' export warning!')

    # create a polygon for each hotspot point from its pixel resolution
    for index, row in gdf_merc.iterrows():
        gdf_merc['geometry'].iloc[index] = get_poly_box(
            row['x'], row['y'], row['resolution_meter'])
    try:
        hotspot_polygon_json = os.path.join(
            interim_file_path, args.prefix_name + 'hotspot_polygon_' +
            end_date.strftime('%Y%m%d') + '.geojson')
        gdf_merc = gdf_merc.round(4)
        gdf_merc.to_file(hotspot_polygon_json, driver='GeoJSON')
        log.info(hotspot_polygon_json + ' is saved successfully.')
    except Exception as e:
        log.exception(e)
        log.warning(hotspot_polygon_json + ' export warning!')

    # for debugging
    # hotspot_polygon_json = os.path.join(interim_file_path, args.prefix_name +
    #     'hotspot_polygon_' + end_date.strftime('%Y%m%d') + '.geojson')
    # gdf_merc = geopandas.read_file(hotspot_polygon_json)

    try:
        log.debug('Processing grid sjoin...')
        df_grid_joined = geopandas.sjoin(df_grid, gdf_merc, op='intersects')
        grid_weight_total = df_grid_joined[[
            'id', 'geo_weight', 'terra_weight', 'aqua_weight', 'n20_weight',
            'npp_weight'
        ]].groupby(['id']).sum()
        grid_geometry = df_grid_joined[['id',
                                        'geometry']].groupby(['id']).first()
        processed_grid = pd.merge(grid_weight_total, grid_geometry, on='id')
        processed_grid_gpd = geopandas.GeoDataFrame(processed_grid)
        processed_grid_gpd.crs = {'init': 'epsg:3857'}
        log.debug('Processing grid completed.')
    except Exception as e:
        log.exception(e)
        log.error('Unable to process grid sjoin!')

    try:
        fusiun_ml_model = load(open(fusiun_ml_model_fpath, 'rb'))
        log.debug('Loaded trained model from ' + fusiun_ml_model_fpath)
        log.debug(f'Model pipeline: {fusiun_ml_model}')
        processed_grid_gpd['prob'] = fusiun_ml_model.predict_proba(
            processed_grid_gpd[fusiun_predict_features])[:, 1]
        log.info(
            'Probabilities filled using FUSIUN Logistic Regression model.')
    except Exception as e:
        log.exception(e)

    try:
        hotspot_grid_json = os.path.join(
            processed_file_path, args.prefix_name + 'hotspot_grid_' +
            end_date.strftime('%Y%m%d') + '.geojson')
        processed_grid_gpd.to_file(hotspot_grid_json, driver='GeoJSON')
        log.info(hotspot_grid_json + ' is saved successfully.')
    except Exception as e:
        log.warning(hotspot_grid_json + ' export warning!')

    try:
        ann_file = os.path.join(
            processed_file_path, args.prefix_name + 'hotspot_grid_' +
            end_date.strftime('%Y%m%d') + '.ann')
        save_fred_grid_meteor_ann(processed_grid_gpd, ann_file,
                                  low_prob_thres, med_prob_thres)
        log.info(ann_file + ' is saved successfully.')
    except Exception as e:
        log.exception(e)
        log.warning(ann_file + ' cannot be saved!')
# iv: clean dataframe:
weather_model_df = weather_model_df[
    weather_model_df["LATITUDE"] != "unknown"
]
weather_model_df = weather_model_df[
    weather_model_df["LONGITUDE"] != "unknown"
]
weather_model_df = weather_model_df[
    weather_model_df["LONGITUDE"].notnull()
]
weather_model_df['LATITUDE'] = weather_model_df['LATITUDE'].apply(
    lambda x: float(x)
)
weather_model_df['LONGITUDE'] = weather_model_df['LONGITUDE'].apply(
    lambda x: float(x)
)
# ________________________________
# v: map latitude, longitude to county:
gpd_file = gpd.read_file(
    "/Users/Hsieh/Desktop/persephone/Data/uscounties.geojson"
)
geo_series = weather_model_df.apply(get_county, axis=1)
gpd_df = gpd.GeoDataFrame(geometry=geo_series)
counties_df = gpd.sjoin(gpd_df, gpd_file, op="within")
weather_model_df['COUNTY'] = counties_df['name']
weather_model_df['STATE'] = counties_df['state_name']

# vi: write cleaned df to csv file:
weather_model_df.to_csv(
    'cleaned_master_weather_complete.csv'
)
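# get_county is referenced above but not defined in this excerpt; a minimal
# sketch of what it presumably does, assuming it simply builds a shapely Point
# for the spatial join (longitude first, then latitude):
from shapely.geometry import Point

def get_county(row):
    # WGS84 point geometries are ordered (longitude, latitude)
    return Point(row["LONGITUDE"], row["LATITUDE"])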
def dir_centerline(links, nodes, meshpolys, meshlines, Imask, gt, pixlen): """ Guess flow directions of links in a braided river channel. Guesses the flow direction of links in a braided river channel network by exploiting a "valleyline" centerline. Two metrics are computed to help guess the correct direction. The first is the number of centerline transects (meshlines) that the link crosses. The second is the local angle of the centerline compared to the link's angle. These metrics are appended to the links dictionary as links['cldist'] and links['clangs']. Parameters ---------- links : dict Network links and associated properties. nodes : dict Network nodes and associated properties. meshpolys : list List of shapely.geometry.Polygons that define the valleyline mesh. meshlines : list List of shapely.geometry.LineStrings that define the valleyline mesh. Imask : np.array Binary mask of the network. gt : tuple gdal-type GeoTransform of the original binary mask. pixlen : float Length resolution of each pixel. Returns ------- links : dict Network links and associated properties with 'cldists' and 'clangs' attributes appended. """ # alg = 20 alg = dy.algmap('cl_dist_guess') # Create geodataframes for intersecting meshpolys with nodes mp_gdf = gpd.GeoDataFrame(geometry=[Polygon(mp) for mp in meshpolys]) rc = np.unravel_index(nodes['idx'], Imask.shape) nodecoords = gu.xy_to_coords(rc[1], rc[0], gt) node_gdf = gpd.GeoDataFrame( geometry=[Point(x, y) for x, y in zip(nodecoords[0], nodecoords[1])], index=nodes['id']) # Determine which meshpoly each node lies within intersect = gpd.sjoin(node_gdf, mp_gdf, op='intersects', rsuffix='right') # Compute guess and certainty, where certainty is how many transects apart # the link endpoints are (longer=more certain) cldists = np.zeros((len(links['id']), 1)) for i, lconn in enumerate(links['conn']): try: first = intersect.loc[lconn[0]].index_right second = intersect.loc[lconn[1]].index_right cldists[i] = second - first except KeyError: pass for i, c in enumerate(cldists): if c != 0: if c > 0: links['guess'][i].append(links['conn'][i][0]) links['guess_alg'][i].append(alg) elif c < 0: links['guess'][i].append(links['conn'][i][-1]) links['guess_alg'][i].append(alg) # Save the distances for certainty links['cldists'] = np.abs(cldists) # Compute guesses based on how the link aligns with the local centerline # direction # alg = 21 alg = dy.algmap('cl_ang_guess') clangs = np.ones((len(links['id']), 1)) * np.nan for i, (lconn, lidx) in enumerate(zip(links['conn'], links['idx'])): # Get coordinates of link endpoints rc = np.unravel_index([lidx[0], lidx[-1]], Imask.shape) try: # Try is because some points may not lie within the mesh polygons # Get coordinates of centerline midpoints first = intersect.loc[lconn[0]].index_right second = intersect.loc[lconn[1]].index_right if first > second: first, second = second, first first_mp = np.mean(np.array(meshlines[first]), axis=0) # midpoint second_mp = np.mean(np.array(meshlines[second + 1]), axis=0) # midpoint except KeyError: continue # Centerline vector cl_vec = second_mp - first_mp cl_vec = cl_vec / np.sqrt(np.sum(cl_vec**2)) # Link vectors - as-is and flipped (reversed) link_vec = dy.get_link_vector(links, nodes, links['id'][i], Imask.shape, pixlen=pixlen) link_vec_rev = -link_vec # Compute interior radians between centerline vector and link vector # (then again with link vector flipped) lva = np.math.atan2(np.linalg.det([cl_vec, link_vec]), np.dot(cl_vec, link_vec)) lvar = np.math.atan2(np.linalg.det([cl_vec, 
                                              link_vec_rev]),
                               np.dot(cl_vec, link_vec_rev))

        # Save the minimum interior angle
        clangs[i] = np.min(np.abs([lva, lvar]))

        # Make a guess; the smaller interior angle (i.e. the link direction that
        # aligns best with the local centerline direction) guesses the link
        # orientation
        if np.abs(lvar) < np.abs(lva):
            links['guess'][i].append(links['conn'][i][1])
            links['guess_alg'][i].append(alg)
        else:
            links['guess'][i].append(links['conn'][i][0])
            links['guess_alg'][i].append(alg)

    links['clangs'] = clangs

    return links
def __init__(self, tessellation, edges, buildings, id_name, unique_id, verbose=True): self.tessellation = tessellation self.edges = edges self.buildings = buildings self.id_name = id_name self.unique_id = unique_id if id_name in buildings.columns: raise ValueError( "'{}' column cannot be in the buildings GeoDataFrame".format( id_name)) cells_copy = tessellation[[unique_id, "geometry"]].copy() print("Buffering streets...") if verbose else None street_buff = edges.copy() street_buff["geometry"] = street_buff.buffer(0.1) print("Generating spatial index...") if verbose else None streets_index = street_buff.sindex print("Difference...") if verbose else None new_geom = [] for ix, cell in tqdm( cells_copy.geometry.iteritems(), total=cells_copy.shape[0], disable=not verbose, ): possible_matches_index = streets_index.query(cell) possible_matches = street_buff.iloc[possible_matches_index] new_geom.append(cell.difference(possible_matches.unary_union)) print("Defining adjacency...") if verbose else None blocks_gdf = gpd.GeoDataFrame(geometry=new_geom) blocks_gdf = blocks_gdf.explode().reset_index(drop=True) spatial_weights = libpysal.weights.Queen.from_dataframe( blocks_gdf, silence_warnings=True) patches = {} jID = 1 for idx in tqdm(blocks_gdf.index, total=blocks_gdf.shape[0], disable=not verbose): # if the id is already present in courtyards, continue (avoid repetition) if idx in patches: continue else: to_join = [idx ] # list of indices which should be joined together neighbours = [] # list of neighbours neighbours += spatial_weights.neighbors[ idx] # neighbours from spatial weights for n in neighbours: while ( n not in to_join ): # until there is some neighbour which is not in to_join to_join.append(n) neighbours += spatial_weights.neighbors[ n] # extend neighbours by neighbours of neighbours :) for b in to_join: patches[b] = jID # fill dict with values jID = jID + 1 blocks_gdf["patch"] = blocks_gdf.index.map(patches) print("Defining street-based blocks...") if verbose else None blocks_single = blocks_gdf.dissolve(by="patch") blocks_single.crs = buildings.crs blocks_single["geometry"] = blocks_single.buffer(0.1) print("Defining block ID...") if verbose else None # street based blocks_single[id_name] = range(len(blocks_single)) print("Generating centroids...") if verbose else None buildings_c = buildings.copy() buildings_c["geometry"] = buildings_c.representative_point( ) # make points print("Spatial join...") if verbose else None centroids_tempID = gpd.sjoin(buildings_c, blocks_single, how="left", op="intersects") tempID_to_uID = centroids_tempID[[unique_id, id_name]] print("Attribute join (tesselation)...") if verbose else None cells_copy = cells_copy.merge(tempID_to_uID, on=unique_id, how="left") print("Generating blocks...") if verbose else None blocks = cells_copy.dissolve(by=id_name) print("Multipart to singlepart...") if verbose else None blocks = blocks.explode() blocks.reset_index(inplace=True, drop=True) blocks["geometry"] = blocks.exterior blocks[id_name] = range(len(blocks)) blocks["geometry"] = blocks.apply(lambda row: Polygon(row.geometry), axis=1) # if polygon is within another one, delete it sindex = blocks.sindex inp, res = sindex.query_bulk(blocks.geometry, predicate="within") res = res[~(inp == res)] mask = np.ones(len(blocks.index), dtype=bool) mask[list(set(res))] = False blocks = blocks.loc[mask] self.blocks = blocks[[id_name, "geometry"]] centroids_w_bl_ID2 = gpd.sjoin(buildings_c, self.blocks, how="left", op="intersects") bl_ID_to_uID = centroids_w_bl_ID2[[unique_id, 
id_name]] print("Attribute join (buildings)...") if verbose else None buildings_m = buildings[[unique_id]].merge(bl_ID_to_uID, on=unique_id, how="left") self.buildings_id = buildings_m[id_name] print("Attribute join (tesselation)...") if verbose else None cells_m = tessellation[[unique_id]].merge(bl_ID_to_uID, on=unique_id, how="left") self.tessellation_id = cells_m[id_name]
def split_on_poly_by_streetname(square_id): ''' ''' logging.info(f"working on shape_id: {square_id}") try: select_square = squares[squares.loc[:, 'SQUARE'] == square_id].copy() if (select_square.geometry.type == 'MultiPolygon').any(): select_square = select_square.explode() address_pts = addresses[addresses.loc[:, 'SQUARE'] == square_id].copy() square_part = 1 for index, one_square in select_square.iterrows(): one_square_shape = one_square['geometry'] address_within = address_pts[address_pts.within( one_square_shape)].copy() if len(address_within) < 4: split_type = "equal_area" poly_shapes_df = split_poly_into_equal_parts( one_square_shape, 4) poly_shapes_partial = gpd.sjoin( poly_shapes_df, address_within[['SQUARE', 'SSL', 'STNAME', 'geometry']], how='left') poly_shapes_partial.loc[:, 'SQUARE_PART'] = square_part poly_shapes_partial = poly_shapes_partial[[ 'group', 'geometry', 'SQUARE', 'SSL', 'STNAME', 'SQUARE_PART' ]] else: split_type = "streetname_breakdown" address_pts_array = np.array([ coords for coords in address_within.geometry.apply( lambda x: (x.x, x.y)) ]) poly_shapes, pts, poly_to_pt_assignments = voronoi_regions_from_coords( address_pts_array, one_square_shape) poly_shapes_df = gpd.GeoDataFrame(pd.DataFrame( poly_to_pt_assignments, columns=['group']), crs="EPSG:4326", geometry=poly_shapes) poly_shapes_w_stname = gpd.sjoin( poly_shapes_df, address_within[['SQUARE', 'SSL', 'STNAME', 'geometry']]) poly_shapes_partial = poly_shapes_w_stname.dissolve( by='STNAME').reset_index() poly_shapes_partial.loc[:, 'SQUARE_PART'] = square_part poly_shapes_partial = poly_shapes_partial[[ 'group', 'geometry', 'SQUARE', 'SSL', 'STNAME', 'SQUARE_PART' ]] if square_part == 1: full_poly_shape_df = poly_shapes_partial.copy() else: full_poly_shape_df = full_poly_shape_df.append( poly_shapes_partial) square_part += 1 except: bad_shape_df = pd.DataFrame( [[0, Polygon([(0, 0), (1, 1), (0, 1)]), square_id, 0]], columns=['group', 'geometry', 'SQUARE', 'SQUARE_PART']) full_poly_shape_df = gpd.GeoDataFrame(bad_shape_df, crs="EPSG:4326", geometry='geometry') return full_poly_shape_df
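# Usage sketch (not part of the original code): 'squares' and 'addresses' are
# assumed to be module-level GeoDataFrames loaded earlier (both carrying a
# 'SQUARE' column); the square id below is only an example value.
result_gdf = split_on_poly_by_streetname("0203")
print(result_gdf[["SQUARE", "SQUARE_PART", "geometry"]].head())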
end = Point(float(end[1]), float(end[0]))  # lon, lat
points = gpd.GeoSeries([start, end])

# Here I need to convert the line to a geodataframe to make the
# merge with attributes
route = gpd.GeoDataFrame(interim)
route['geometry'] = gpd.GeoSeries(LineString(lon_lat))[0]  # add geometry
route.crs = {'init': 'epsg:4326'}  # define coords

# Match route with attributes of the municipality
route_mun = gpd.sjoin(route, slv, how="inner", op='intersects')
route_mun['codmun'] = route_mun['COD_MUN4']

# Then replace the interim database. Note that there could be more than
# one match
interim = route_mun[[
    'circuito', 'latlon0', 'latlon1', 'time', 'dist', 'codmun'
]]  # mun identifier

# Append to the main dataframe
output = output.append(interim)
print(i)
def measure_network_density(streets_for_networkd_prj, gross_city_blocks_prj):
    """
    Adds network density (m/ha) onto a gdf of gross urban blocks.

    Requires a gdf of streets to overlay with the gross city blocks. Streets
    that are within a gross urban block (i.e. do not coincide with its
    perimeter) have the block id added to them. The length of these streets is
    then aggregated by block id and the total length added to the gross city
    blocks gdf. Half the length of the perimeter (i.e. the bounding roads) is
    then added to the gdf as well, and the network density is calculated as
    the sum of these two numbers divided by the gross area of the block.

    Parameters
    ----------
    streets_for_networkd_prj : geodataframe
        a projected gdf of streets
    gross_city_blocks_prj : geodataframe
        a projected gdf of gross city blocks

    Returns
    -------
    gross_city_blocks_prj
        GeoDataFrame
    """
    # OSMnx returns some highway values as lists, this converts them to strings
    streets_for_networkd_prj['highway'] = streets_for_networkd_prj[
        'highway'].apply(lambda x: ', '.join(x) if type(x) is list else x)

    # make a new gdf which only contains street fragments completely within a
    # gross city block
    streets_in_gross_blocks = gpd.sjoin(streets_for_networkd_prj,
                                        gross_city_blocks_prj,
                                        how="inner",
                                        op="within")

    # Write the length of these inner streets into a new column 'inner_streets_m'
    streets_in_gross_blocks[
        'inner_streets_m'] = streets_in_gross_blocks.length.round(decimals=1)

    # aggregate the total length of inner streets for each block
    inner_streets_agg_by_block = streets_in_gross_blocks.groupby(
        ['city_block_id']).sum().round(decimals=2)

    # reindex to keep only the columns necessary
    keep_columns = ['inner_streets_m']
    inner_streets_agg_by_block = inner_streets_agg_by_block.reindex(
        columns=keep_columns)

    # merge the total inner street length onto the gross blocks
    gross_city_blocks_prj = gross_city_blocks_prj.merge(
        inner_streets_agg_by_block,
        how='outer',
        left_index=True,
        right_index=True)

    # Fill NaN with zeroes
    gross_city_blocks_prj.fillna(0, axis=1, inplace=True)

    gross_city_blocks_prj[
        'outer_streets_m'] = gross_city_blocks_prj.length.round(decimals=2)
    gross_city_blocks_prj['gross_area_ha'] = (gross_city_blocks_prj.area /
                                              10000).round(decimals=4)
    gross_city_blocks_prj['network_density_m_ha'] = (
        ((gross_city_blocks_prj['outer_streets_m'] / 2) +
         (gross_city_blocks_prj['inner_streets_m'])) /
        (gross_city_blocks_prj.area / 10000)).round(decimals=2)

    return gross_city_blocks_prj
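# Usage sketch (not part of the original code): the streets can come from OSMnx
# as below; 'gross_city_blocks_prj' is assumed to be a projected GeoDataFrame of
# gross urban blocks carrying a 'city_block_id' column, produced earlier in the
# pipeline. The place name is only an example.
import osmnx as ox

G = ox.graph_from_place("Clerkenwell, London, UK", network_type="drive")
streets_prj = ox.project_gdf(ox.graph_to_gdfs(G, nodes=False))
gross_city_blocks_prj = measure_network_density(streets_prj, gross_city_blocks_prj)
print(gross_city_blocks_prj[["gross_area_ha", "network_density_m_ha"]].head())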
# %% queens_top_plants = gf.df_queens_plants.sort_values("gwh", ascending=False).iloc[:5] queens_top_plants_gdf = queens_plants_gdf.rename(columns={ "Plant_Code": "plant_id" }).merge( right=queens_top_plants[["plant_id"]], on=["plant_id"], how="inner", validate="one_to_one", ) queens_top_nbd_gdf = gpd.sjoin(queens_nbd_gdf, queens_top_plants_gdf, how="inner", op="intersects")[["ntaname", "geometry" ]].drop_duplicates() queens_top_nbd_gdf.loc[:, "centroid_lat"] = queens_top_nbd_gdf.geometry.centroid.y queens_top_nbd_gdf.loc[:, "centroid_long"] = queens_top_nbd_gdf.geometry.centroid.x alt.Chart(queens_top_nbd_gdf).mark_geoshape( fill="lightgray", stroke="white") + alt.Chart(queens_top_plants_gdf).mark_geoshape( ) + alt.Chart(queens_top_plants_gdf).mark_text( align="left", baseline="middle").encode( longitude="Longitude", latitude="Latitude", text="Plant_Name",
def tract_links(resource, doc, env, *args, **kwargs): from metapack.rowgenerator import PandasDataframeSource from metapack import get_cache from shapely.geometry import Point import geopandas as gpd # First, geo join the tracts into the communities and cities. comm = doc.resource('cities_communities').geoframe() tracts = doc.resource('tracts').dataframe() tracts['intp'] = tracts.apply( lambda r: Point(float(r.intptlon), float(r.intptlat)), axis=1) tract_pt = gpd.GeoDataFrame(tracts, geometry='intp') tract_community = gpd.sjoin(comm, tract_pt, op='contains') columns = ['geoid', 'type', 'name', 'name_code', 'city', 'link_code'] tc = tract_community.rename({'name_left': 'name'}, axis=1)[columns] # Now link everything together. acronyms = doc.reference('acronyms') acro_map = dict(list(acronyms)[1:]) acro_map[''] = '' _1 = tc[['geoid']].drop_duplicates().set_index('geoid').join( tracts[['geoid', 'geometry']].set_index('geoid')) _2 = tc.set_index('geoid') _3 = _2[_2.type == 'city'][['name', 'name_code']] _4 = _2[_2.type == 'county'][['name', 'name_code']] _5 = _2[_2.type == 'sd_community'][['name', 'name_code']] _6 = _2[_2.type == 'county_community'][['name', 'name_code']] _7 = _1.join(_3, rsuffix='_city').join(_4, rsuffix='_county')\ .join(_5, rsuffix='_sdc').join(_6, rsuffix='_cnc') _7.columns =\ [ 'geometry', 'city_name', 'city_code', 'county_name', 'county_code', 'community_name', 'community_cpcode', 'cnc_name', 'cnc_code'] # Move the county name into the city columns, then drop it _7['city_name'] = _7.city_name.where( ~((_7.city_name.isnull()) & (_7.county_code == 'CN')), 'COUNTY') _7['city_code'] = _7.city_code.where( ~((_7.city_code.isnull()) & (_7.county_code == 'CN')), 'CN') _7.drop(['county_name', 'county_code'], axis=1, inplace=True) # Move the county community names into the community columns _7['community_name'] = _7.community_name.where(_7.city_code != 'CN', _7.cnc_name).fillna('') _7['community_cpcode'] = _7.community_cpcode.where(_7.city_code != 'CN', _7.cnc_code).fillna('0') _7.drop(['cnc_name', 'cnc_code'], axis=1, inplace=True) _7 = _7.fillna('') _7['city_name'] = _7.city_name.apply(lambda v: v.title()) _7['community_name'] = _7.community_name.apply( lambda v: str(v).title()).fillna('') _7['community_code'] = _7.community_name.apply( lambda v: acro_map[clean_comm_name(v)].upper()) # move geometry to the end _7 = _7[list(_7.columns)[1:] + list(_7.columns)[:1]] yield from PandasDataframeSource('<df>', _7, get_cache())
# use coordinates strikes = gp.GeoDataFrame(df_strikes,geometry=gp.points_from_xy(df_strikes.longitude, df_strikes.latitude)).set_crs(epsg=z_epsg_wgs84) #single strike sources strikes_internet = gp.GeoDataFrame(df_internet,geometry=gp.points_from_xy(df_internet.longitude, df_internet.latitude)).set_crs(epsg=z_epsg_wgs84) strikes_cities = gp.GeoDataFrame(df_cities,geometry=gp.points_from_xy(df_cities.longitude, df_cities.latitude)).set_crs(epsg=z_epsg_wgs84) strikes_ordnungsamt = gp.GeoDataFrame(df_ordnungsamt,geometry=gp.points_from_xy(df_ordnungsamt.longitude, df_ordnungsamt.latitude)).set_crs(epsg=z_epsg_wgs84) # merge spatially with kreise & wahlkreise to know in which ags they lie strikes_ags = gp.sjoin(strikes, kreise[['AGS', 'geometry']], how='left', op='within') strikes_ags.drop('index_right', axis=1, inplace=True) strikes_ags = gp.sjoin(strikes_ags, wahlkreise[['WKR_NR', 'geometry']], how='left', op='within') strikes_ags.drop('index_right', axis=1, inplace=True) strikes_ags = gp.sjoin(strikes_ags, teralytics[['FID', 'geometry']]) strikes_ags.drop('index_right', axis=1, inplace=True) strikes_ags.rename(columns={'AGS':'ags5', 'WKR_NR':'wkr_nr', 'FID':'teralytics_id'}, inplace=True)
def test_sjoin_inner(self): df = sjoin(self.pointdf, self.polydf, how="inner") assert df.shape == (11, 8)
def _consolidate_intersections_rebuild_graph(G, tolerance=10, reconnect_edges=True): """ Consolidate intersections comprising clusters of nearby nodes. Merge nodes and return a rebuilt graph with consolidated intersections and reconnected edge geometries. The tolerance argument should be adjusted to approximately match street design standards in the specific street network, and you should always use a projected graph to work in meaningful and consistent units like meters. Returned graph's node IDs represent clusters rather than osmids. Refer to nodes' osmid attributes for original osmids. If multiple nodes were merged together, the osmid attribute is a list of merged nodes' osmids. Parameters ---------- G : networkx.MultiDiGraph a projected graph tolerance : float nodes are buffered to this distance (in graph's geometry's units) and subsequent overlaps are dissolved into a single node reconnect_edges : bool ignored if rebuild_graph is not True. if True, reconnect edges and their geometries in rebuilt graph to the consolidated nodes and update edge length attributes; if False, returned graph has no edges (which is faster if you just need topologically consolidated intersection counts). Returns ------- H : networkx.MultiDiGraph a rebuilt graph with consolidated intersections and reconnected edge geometries """ # STEP 1 # buffer nodes to passed-in distance and merge overlaps. turn merged nodes # into gdf and get centroids of each cluster as x, y node_clusters = gpd.GeoDataFrame( geometry=_merge_nodes_geometric(G, tolerance)) centroids = node_clusters.centroid node_clusters["x"] = centroids.x node_clusters["y"] = centroids.y # STEP 2 # attach each node to its cluster of merged nodes. first get the original # graph's node points then spatial join to give each node the label of # cluster it's within node_points = utils_graph.graph_to_gdfs(G, edges=False)[["geometry"]] gdf = gpd.sjoin(node_points, node_clusters, how="left", op="within") gdf = gdf.drop(columns="geometry").rename( columns={"index_right": "cluster"}) # STEP 3 # if a cluster contains multiple components (i.e., it's not connected) # move each component to its own cluster (otherwise you will connect # nodes together that are not truly connected, e.g., nearby deadends or # surface streets with bridge). 
groups = gdf.groupby("cluster") for cluster_label, nodes_subset in groups: if len(nodes_subset) > 1: # identify all the (weakly connected) component in cluster wccs = list( nx.weakly_connected_components(G.subgraph(nodes_subset.index))) if len(wccs) > 1: # if there are multiple components in this cluster suffix = 0 for wcc in wccs: # set subcluster xy to the centroid of just these nodes subcluster_centroid = node_points.loc[ wcc].unary_union.centroid gdf.loc[wcc, "x"] = subcluster_centroid.x gdf.loc[wcc, "y"] = subcluster_centroid.y # move to subcluster by appending suffix to cluster label gdf.loc[wcc, "cluster"] = f"{cluster_label}-{suffix}" suffix += 1 # STEP 4 # create new empty graph and copy over misc graph data H = nx.MultiDiGraph() H.graph = G.graph # STEP 5 # create a new node for each cluster of merged nodes # regroup now that we potentially have new cluster labels from step 3 groups = gdf.groupby("cluster") for cluster_label, nodes_subset in groups: osmids = nodes_subset.index.to_list() if len(osmids) == 1: # if cluster is a single node, add that node to new graph H.add_node(cluster_label, **G.nodes[osmids[0]]) else: # if cluster is multiple merged nodes, create one new node to # represent them H.add_node( cluster_label, osmid=str(osmids), x=nodes_subset["x"].iloc[0], y=nodes_subset["y"].iloc[0], ) if not G.edges or not reconnect_edges: # if reconnect_edges is False or there are no edges in original graph # (after dead-end removed), then skip edges and return new graph as-is return H # STEP 6 # create new edge from cluster to cluster for each edge in original graph gdf_edges = utils_graph.graph_to_gdfs(G, nodes=False) for u, v, k, data in G.edges(keys=True, data=True): u2 = gdf.loc[u, "cluster"] v2 = gdf.loc[v, "cluster"] # only create the edge if we're not connecting the cluster # to itself, but always add original self-loops if (u2 != v2) or (u == v): data["u_original"] = u data["v_original"] = v if "geometry" not in data: data["geometry"] = gdf_edges.loc[(u, v, k), "geometry"] H.add_edge(u2, v2, **data) # STEP 7 # for every group of merged nodes with more than 1 node in it, extend the # edge geometries to reach the new node point for cluster_label, nodes_subset in groups: # but only if there were multiple nodes merged together, # otherwise it's the same old edge as in original graph if len(nodes_subset) > 1: # get coords of merged nodes point centroid to prepend or # append to the old edge geom's coords x = H.nodes[cluster_label]["x"] y = H.nodes[cluster_label]["y"] xy = [(x, y)] # for each edge incident to this new merged node, update its # geometry to extend to/from the new node's point coords in_edges = set(H.in_edges(cluster_label, keys=True)) out_edges = set(H.out_edges(cluster_label, keys=True)) for u, v, k in in_edges | out_edges: old_coords = list(H.edges[u, v, k]["geometry"].coords) new_coords = xy + old_coords if cluster_label == u else old_coords + xy new_geom = LineString(new_coords) H.edges[u, v, k]["geometry"] = new_geom # update the edge length attribute, given the new geometry H.edges[u, v, k]["length"] = new_geom.length return H
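# Usage sketch (not part of the original code): the helper expects a projected
# MultiDiGraph, so project the graph first. The place name is only an example;
# in OSMnx the public entry point wrapping this logic is
# ox.consolidate_intersections(..., rebuild_graph=True).
import osmnx as ox

G = ox.graph_from_place("Piedmont, California, USA", network_type="drive")
G_proj = ox.project_graph(G)
G_cons = _consolidate_intersections_rebuild_graph(G_proj, tolerance=15,
                                                  reconnect_edges=True)
print(len(G_proj.nodes), "nodes before;", len(G_cons.nodes), "nodes after")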
def write_source_models(version=0, full=False, use_recomputed=False, prefix='nt2012'): ''' Writes all source models. ''' # compute some filenames if use_recomputed: smoothed_data_path = RECOMPUTED_DATA_PATH smoothed_prefix = 'recomputed' else: smoothed_data_path = ORIGINAL_DATA_PATH smoothed_prefix = prefix layers_df = pd.read_csv(LAYERS_FORMAT % version, index_col='layerid') # load electronic supplement for areal zones df_erroneous = pd.read_csv(StringIO('''\ zoneid layerid strike dip rake 14 1 228 69 330 914 1 192 46 124 '''), sep=r'\s+', index_col='zoneid') print('Reading areal polygons and seismicity statistics for each layer') areal_dfs = [] for layer_id in layers_df.index: # read seismicity and polygons and join them seismicity_file = os.path.join(ORIGINAL_DATA_PATH, SEISMICITY_FORMAT % layer_id) print('Reading: ' + os.path.abspath(seismicity_file)) seismicity_df = pd.read_csv(seismicity_file) seismicity_df.set_index('zoneid', inplace=True, verify_integrity=True) seismicity_df.rename(columns=SEISMICITY_ALIASES, inplace=True) # preserve errors in electonic supplement in version v0 if int(version) == 0: if layer_id == 4: (seismicity_df.loc[169], seismicity_df.loc[170]) = \ (seismicity_df.loc[170].copy(), seismicity_df.loc[169].copy()) print('Swapped seismicity parameters for zones 169 and 170.') for zoneid, row in df_erroneous[df_erroneous.layerid == layer_id].iterrows(): row = row.drop('layerid') for column in row.keys(): seismicity_df.loc[zoneid, column] = row[column] print('Restored zone %d erroneous %s: %s' % (zoneid, row.keys().values, row.values)) polygon_file = os.path.join(ORIGINAL_DATA_PATH, POLYGON_FORMAT % layer_id) print('Reading: ' + os.path.abspath(polygon_file)) polygon_df = read_polygons(polygon_file) polygon_df.set_index('zoneid', inplace=True, verify_integrity=True) df = seismicity_df.join(polygon_df, how='outer') # add layer info df.insert(0, 'layerid', layer_id) areal_dfs.append(df) # put it all together columns = list( unique_everseen([column for column in df.columns for df in areal_dfs])) areal_df = pd.concat(areal_dfs, sort=True)[columns].sort_index() # auxiliary information aux_file = AUX_FORMAT % int(version) print('\nReading: ' + os.path.abspath(aux_file)) aux_df = pd.read_csv(aux_file, index_col='zoneid').sort_index() assert (areal_df.index == aux_df.index).all() if 'layerid' in aux_df: aux_df.drop(columns='layerid', inplace=True) areal_df = areal_df.join(aux_df) # assign undefined focal mechanisms as reverse faulting - shouldn't matter undefined = areal_df['dip'] == -1 areal_df.loc[undefined, 'rake'] = 90 areal_df.loc[undefined, 'dip'] = 45 areal_df.loc[undefined, 'strike'] = 0 # augment areal zone description tables areal_df = areal_df.join(layers_df, on='layerid') areal_df['rake'] = wrap(areal_df['rake']) areal_df['mechanism'] = focal_mech(areal_df['dip'], areal_df['rake']) areal_df['new style'] = faulting_style(areal_df['strike'], areal_df['dip'], areal_df['rake']) areal_df['strike2'], areal_df['dip2'], areal_df['rake2'] = zip(*[ aux_plane(strike, dip, rake) for strike, dip, rake in zip( areal_df['strike'], areal_df['dip'], areal_df['rake']) ]) areal_df['mechanism2'] = focal_mech(areal_df['dip2'], areal_df['rake2']) areal_df['mmin'] = MIN_MAGS[0] areal_df['strike2'] = areal_df['strike2'].round(1) areal_df['dip2'] = areal_df['dip2'].round(1) areal_df['rake2'] = areal_df['rake2'].apply(wrap) areal_df['rake2'] = areal_df['rake2'].round(1) areal_df['rake2'] = areal_df['rake2'].apply(wrap) swap = areal_df['focal plane'] == 'secondary' print('Treating %d focal 
planes as secondary: %s' % (sum(swap), ', '.join(str(item) for item in areal_df.index[swap]))) for column in ['strike', 'dip', 'rake', 'mechanism']: areal_df.loc[swap, [column, column + '2']] = \ areal_df.loc[swap, [column + '2', column]].values # grab mmax and bvalue from zone above if mmax zero for this zone check_keys = ['mmax', 'b'] none_found = True for i, area_series in areal_df[(areal_df[check_keys] == 0).any( axis=1)].iterrows(): alternate_zone = int(area_series.name / 10) for key in check_keys: if area_series['a'] != 0 and area_series[key] == 0: print('For zone %d taking %s from zone %d' % (area_series.name, key, alternate_zone)) areal_df.at[i, key] = areal_df.at[alternate_zone, key] none_found = False if none_found: print('SUCCESS: All zones already have mmax & b defined.') # write areal CSV areal_source_model_base = AREAL_MODEL_FORMAT % (prefix, int(version)) areal2csv(areal_df, areal_source_model_base) # write areal NRML mark = time() df2nrml(areal_df, areal_source_model_base) print('Finished writing areal model to NRML: %s\n' % pd.to_timedelta(time() - mark, unit='s')) # read logic tree description table source_tree_tsv = SOURCE_TREE_FORMAT % int(version) print('Logic tree before collapse:') source_tree_symbolic_df = read_tree_tsv(source_tree_tsv) print(source_tree_symbolic_df) # compute collapsed rates areal_collapsed_df, collapsed_tree_df, _, _ = \ collapse_sources(areal_df, source_tree_symbolic_df) print('Logic tree after collapse:') print(collapsed_tree_df) # write areal sources to NRML mark = time() areal_collapsed_model_base = areal_source_model_base + ' collapsed' df2nrml(areal_collapsed_df, areal_collapsed_model_base) print('Finished writing collapsed areal model to NRML: %s\n' % pd.to_timedelta(time() - mark, unit='s')) # completeness tables print('Reading completeness tables.') completeness_df = pd.read_csv( '../Data/thingbaijam2011seismogenic/Table1.csv', header=[0, 1], index_col=[0, 1]) completeness_df.columns = [ ' '.join(col).strip() for col in completeness_df.columns.values ] # electronic supplement for smoothed-gridded model print('Reading smoothed seismicity data ...') smoothed_data_format = os.path.join(smoothed_data_path, SMOOTHED_FORMAT) mark = time() smoothed_df_list = [] for i, min_mag in enumerate(MIN_MAGS): layer_smoothed_df_list = [] for layer_id, layer in layers_df.join(completeness_df, on=['zmin', 'zmax']).iterrows(): layer_smoothed_df = pd.read_csv(smoothed_data_format % (layer_id, min_mag)) nu_mag = 'nu%s' % str(min_mag).replace('.', '_') rename_cols = {nu_mag: 'nu', 'lat': 'latitude', 'lon': 'longitude'} layer_smoothed_df.rename(columns=rename_cols, inplace=True) layer_smoothed_df['layerid'] = layer_id layer_smoothed_df['mmin model'] = min_mag layer_smoothed_df['mmin'] = min_mag layer_smoothed_df['duration'] = (layer[str(min_mag) + ' end'] - layer[str(min_mag) + ' start'] + 1) if use_recomputed: layer_smoothed_df['lambda'] = layer_smoothed_df['nu'] layer_smoothed_df['nu'] = (layer_smoothed_df['lambda'] * layer_smoothed_df['duration']) else: layer_smoothed_df['lambda'] = (layer_smoothed_df['nu'] / layer_smoothed_df['duration']) layer_smoothed_df_list.append(layer_smoothed_df) layer_smoothed_df = pd.concat(layer_smoothed_df_list, ignore_index=True) smoothed_df_list.append(layer_smoothed_df) smoothed_df = pd.concat(smoothed_df_list, ignore_index=True) len_smoothed = smoothed_df.shape[0] smoothed_df.sort_values(['layerid', 'mmin model', 'longitude', 'latitude']) smoothed_df['geometry'] = [ Point(longitude, latitude) for longitude, latitude in zip( 
smoothed_df['longitude'], smoothed_df['latitude']) ] smoothed_df = gpd.GeoDataFrame(smoothed_df, crs='WGS84') print('Read %d point sources from %d files: %s\n' % (len(smoothed_df), len(MIN_MAGS) * len(layers_df), pd.to_timedelta(time() - mark, unit='s'))) # associate smoothed-gridded points with zones # we are only interested in active zones active_areal_df = areal_df[areal_df['a'] != 0].reset_index() print('Associate point sources in areal zones with those zones ...') # quick, requires no transformations mark = time() smoothed_df['distance'] = np.inf smoothed_dfs = [] for layer_id in layers_df.index: smoothed_layer_df = smoothed_df[smoothed_df['layerid'] == layer_id] areal_layer_df = gpd.GeoDataFrame( active_areal_df[active_areal_df['layerid'] == layer_id], crs='WGS84') smoothed_layer_df = gpd.sjoin( smoothed_layer_df, areal_layer_df[['zoneid', 'a', 'geometry']], how='left', op='within') smoothed_dfs.append(smoothed_layer_df) smoothed_df = pd.concat(smoothed_dfs) smoothed_df.drop(columns='index_right', inplace=True) smoothed_df['in zoneid'] = smoothed_df['zoneid'].copy() assigned = (~np.isnan(smoothed_df['in zoneid'])) & (smoothed_df['a'] != 0) smoothed_df.loc[assigned, 'distance'] = 0 print('Spatial join accounted for %.2f%% of sources: %s\n' % (100 * len(smoothed_df[assigned]) / len(smoothed_df), pd.to_timedelta(time() - mark, unit='s'))) unassigned_zones = ( set(active_areal_df.zoneid.unique()) - set(smoothed_df[pd.notnull(smoothed_df.zoneid)].zoneid.unique())) if list(unassigned_zones): raise RuntimeError('Zones not assigned to any point: ' + str(sorted(list(unassigned_zones)))) else: print('SUCCESS: All active areal zones assigned to at least one point') # no point should be associated with multiple zones id_columns = ['latitude', 'longitude', 'layerid', 'mmin'] duplicated_df = smoothed_df[smoothed_df.duplicated( subset=id_columns, keep=False)].sort_values(id_columns + ['zoneid']) if duplicated_df.empty: print('SUCCESS: No grid point fell in multiple areal zones') else: duplicated_df.to_csv('smoothed_duplicated.csv') point_a = duplicated_df.iloc[0] point_b = duplicated_df.iloc[1] zone_a = active_areal_df.at[int(point_a.zoneid), 'geometry'] zone_b = active_areal_df.at[int(point_b.zoneid), 'geometry'] _, ax = plt.subplots() ax.add_patch(PolygonPatch(zone_a, alpha=0.5)) ax.add_patch(PolygonPatch(zone_b, alpha=0.5)) ax.scatter(duplicated_df['longitude'], duplicated_df['latitude']) ax.set_xlim((point_a.longitude - 5, point_a.longitude + 5)) ax.set_ylim((point_a.latitude - 5, point_a.latitude + 5)) ax.set_aspect(1) print(int(point_a.zoneid), point_a.layerid, dumps(zone_a, rounding_precision=2)) print(int(point_b.zoneid), point_a.layerid, dumps(zone_b, rounding_precision=2)) print(point_a.longitude, point_a.latitude) raise RuntimeError('Points assigned to multiple zones.') # associate points nearest to zones print('Find nearest areal zones for remaining points ...') mark = time() active_areal_df['polygon'] = [ MyPolygon([ geo.point.Point(lat, lon) for lat, lon in zip(*zone.geometry.exterior.coords.xy) ]) for _, zone in active_areal_df.iterrows() ] unassigned_df = smoothed_df.loc[~assigned].copy() distances = np.full((len(unassigned_df), len(active_areal_df)), np.inf) for i, area_series in active_areal_df.iterrows(): in_layer = (unassigned_df['layerid'] == area_series['layerid']).values mesh = geo.mesh.Mesh(unassigned_df.loc[in_layer, 'longitude'].values, unassigned_df.loc[in_layer, 'latitude'].values) distances[in_layer, i] = area_series['polygon'].distances(mesh) unassigned_df.loc[:, 
'zoneid'] = active_areal_df.loc[ np.argmin(distances, axis=1), 'zoneid'].values unassigned_df.loc[:, 'distance'] = np.amin(distances, axis=1) print('Nearest zone required for %.0f%% of sources: %s\n' % (100 * len(unassigned_df) / len(smoothed_df), pd.to_timedelta(time() - mark, unit='s'))) smoothed_df = pd.concat((smoothed_df[assigned], unassigned_df)) # copy parameters of nearest areal zone print('For each point source, copy parameters of nearest areal zone') columns_to_copy = [ 'zoneid', 'zmax', 'zmin', 'hypo_depth', 'tectonic subregion', 'a', 'b', 'stdb', 'mmax', 'stdmmax', 'rake', 'dip', 'strike', 'aspect ratio', 'msr' ] smoothed_df.drop(columns=['a'], inplace=True) smoothed_df = smoothed_df.merge(active_areal_df[columns_to_copy], on='zoneid') smoothed_df['a'] = (np.log10(smoothed_df['lambda']) + smoothed_df['b'] * smoothed_df['mmin model']) assert len_smoothed == smoothed_df.shape[0] # check for unassigned parameters display_drop = [ 'zmax', 'zmin', 'aspect ratio', 'msr', 'rake', 'dip', 'strike', 'stdb', 'stdmmax' ] no_zoneid_df = smoothed_df[smoothed_df['zoneid'].isnull()] no_mmax_df = smoothed_df[smoothed_df['mmax'] == 0] no_b_df = smoothed_df[smoothed_df['b'] == 0] if not no_zoneid_df.empty: print(no_zoneid_df.drop(display_drop, axis=1).head()) RuntimeError("Leftover points with no assigned zone id") if not no_mmax_df.empty: print(no_mmax_df.drop(display_drop, axis=1).head()) RuntimeError("Leftover points with no assigned mmax") if not no_b_df.empty: print(no_b_df.drop(display_drop, axis=1).head()) RuntimeError("Leftover points with no assigned b") if (no_mmax_df.empty and no_b_df.empty and no_zoneid_df.empty): print("SUCCESS: No points with unassigned MFD or zone") else: raise RuntimeError('Unassigned parameters remain.') # Thinning of models allows quick testing and git archiving of a sample res_deg = 1 thinned_df = smoothed_df.loc[ np.isclose(np.remainder(smoothed_df['latitude'], res_deg), 0) & np.isclose(np.remainder(smoothed_df['longitude'], res_deg), 0)].copy( ) print('Thinning to %g° spacing reduces number of points from %d to %d.\n' % (res_deg, len(smoothed_df), len(thinned_df))) # write thinned models mark = time() thinned_base = (SMOOTHED_MODEL_FORMAT.replace('v%d', '') + 'thinned ' + 'v%d') % (smoothed_prefix, int(version)) points2csv(thinned_df, thinned_base) points2nrml(thinned_df, thinned_base) print('Wrote %d thinned smoothed-gridded sources to CSV & NRML: %s\n' % (len(thinned_df), pd.to_timedelta(time() - mark, unit='s'))) thinned_collapsed_df, collapsed_tree_df, _, _ = \ collapse_sources(thinned_df, source_tree_symbolic_df) points2nrml(thinned_collapsed_df, thinned_base + ' collapsed') print( 'Wrote %d collapsed thinned sources to CSV & NRML: %s\n' % (len(thinned_collapsed_df), pd.to_timedelta(time() - mark, unit='s'))) # write full smoothed-gridded models (~10 minutes) if full: mark = time() smoothed_model_base = SMOOTHED_MODEL_FORMAT % (smoothed_prefix, int(version)) points2csv(smoothed_df, smoothed_model_base) points2nrml(smoothed_df, smoothed_model_base, by='mmin model') print('Wrote %d full smoothed-gridded sources to CSV & NRML: %s\n' % (len(smoothed_df), pd.to_timedelta(time() - mark, unit='s'))) # write collapsed smoothed-gridded sources to NRML (~10 minutes) mark = time() smoothed_collapsed_df, collapsed_tree_df, _, _ = \ collapse_sources(smoothed_df, source_tree_symbolic_df) points2nrml(smoothed_collapsed_df, smoothed_model_base + ' collapsed') print( 'Wrote %d collapsed smoothed-gridded sources to CSV & NRML: %s\n' % (len(smoothed_collapsed_df), 
pd.to_timedelta(time() - mark, unit='s')))
def time_sjoin(self, op): sjoin(self.df1, self.df2, op=op)
corpus = args.corpus
data = []
with open(corpus) as f:
    for i, l in enumerate(f):
        t = ujson.loads(l)
        if "hashtags" in t:
            data.append({'geometry': shapely.geometry.Point(t['geo']['longitude'],
                                                            t['geo']['latitude']),
                         'melt': t['melt'],
                         'hashtags': t['hashtags']})
        else:
            data.append({'geometry': shapely.geometry.Point(t['geo']['longitude'],
                                                            t['geo']['latitude']),
                         'melt': t['melt']})

allTweets = gpd.GeoDataFrame(data)
allTweets = allTweets.where(pd.notnull(allTweets), None)
allTweets.crs = departements.crs

sys.stdout.write("1/6 - Joining the tweets with the départements\n")
allTweets_with_departments = gpd.sjoin(allTweets, departements,
                                       how="inner", op='intersects')

sys.stdout.write("2/6 - Saving the join to the .csv file\n")
allTweets_with_departments.to_csv('tweets_with_departments.csv')


## frequency by département
def freqByDep(annotationType):
    freqParDepartement = defaultdict(lambda: defaultdict(int))
    for i in range(allTweets_with_departments.shape[0]):
        percentage = round(i * 100 / allTweets_with_departments.shape[0])
        sys.stdout.write("\r3/6 - Computing frequencies - " + str(annotationType)
                         + " : " + str(percentage) + "%")
        if annotationType == "hashtags":
            if allTweets_with_departments.iloc[i][annotationType] is not None:
                for h in allTweets_with_departments.iloc[i][annotationType]: