def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    fields = [
        "dc_dist",
        "dc_key",
        "dispatch_date_time",
        "location_block",
        "psa",
        "text_general_code",
        "ucr_general",
    ]
    where = "text_general_code LIKE '%Homicide - Criminal%'"
    gdf = replace_missing_geometries(
        carto2gpd.get(url, "incidents_part1_part2", fields=fields, where=where)
    ).to_crs(epsg=EPSG)

    return (
        gdf.pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PoliceDistricts.get())
        .assign(
            dispatch_date_time=lambda df: pd.to_datetime(df.dispatch_date_time),
            year=lambda df: df.dispatch_date_time.dt.year,
            text_general_code=lambda df: df.text_general_code.str.strip(),
            time_offset=lambda df: (
                df.dispatch_date_time - pd.to_datetime("1/1/2006").tz_localize("UTC")
            )
            .dt.total_seconds()
            .values,
        )
        .sort_values("dispatch_date_time", ascending=False)
        .reset_index(drop=True)
    )
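# A minimal sketch of the time_offset computation used above: seconds elapsed
# since Jan 1, 2006 (UTC). The sample timestamp is illustrative, not from the
# live table.
import pandas as pd

ts = pd.to_datetime(["2006-01-02 00:00:00"]).tz_localize("UTC")
offset = (ts - pd.to_datetime("1/1/2006").tz_localize("UTC")).total_seconds()
print(offset)  # one day in seconds: 86400.0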
def get_data(days):
    """
    Query the CARTO database to get shootings from the recent past.

    Parameters
    ----------
    days : int
        the number of days to get data for

    Returns
    -------
    gdf : GeoDataFrame
        the data frame holding the queried data
    """
    # Query for the data
    URL = "https://phl.carto.com/api/v2/sql"
    WHERE = f"date_ >= current_date - {days}"
    gdf = carto2gpd.get(URL, "shootings", where=WHERE)

    # Add lat/lng columns
    gdf["lat"] = gdf.geometry.y
    gdf["lng"] = gdf.geometry.x

    # Remove rows with missing geo coordinates
    gdf = gdf.dropna(subset=["lat", "lng"])

    return gdf
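# A hedged usage sketch for get_data: assumes carto2gpd is installed and the
# CARTO endpoint is reachable; the 90-day window is illustrative only.
recent = get_data(days=90)
print(f"Fetched {len(recent)} shootings from the last 90 days")
print(recent[["lat", "lng"]].head())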
def covid_realtime():
    """Get real-time cumulative COVID case counts by zip code."""
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(
        url, "covid_cases_by_zip", fields=["zip_code", "count", "etl_timestamp"]
    )
    return gdf
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "ppr_tree_canopy_points_2015")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def get_data(days):
    """
    Query the CARTO database to get shootings from the recent past.

    Parameters
    ----------
    days : int
        the number of days to get data for

    Returns
    -------
    gdf : GeoDataFrame
        the data frame holding the queried data
    """
    # Query for the data
    URL = "https://phl.carto.com/api/v2/sql"
    WHERE = f"date_ >= current_date - {days}"
    gdf = carto2gpd.get(URL, "shootings", where=WHERE)

    # Re-map the fatal column to Yes/No
    gdf["fatal"] = gdf["fatal"].map({0: "No", 1: "Yes"})

    # Remove entries where fatal is NaN
    gdf = gdf.dropna(subset=["fatal"])

    # Remove missing geometries
    gdf = gdf.loc[gdf.geometry.notnull()]

    return gdf
def download(cls, **kwargs):
    # The raw data fields to query
    fields = [
        "dc_dist",
        "dc_key",
        "dispatch_date_time",
        "location_block",
        "psa",
        "text_general_code",
        "ucr_general",
    ]
    url = "https://phl.carto.com/api/v2/sql"
    where = "extract(year from dispatch_date_time) = 2018"
    gdf = carto2gpd.get(url, "incidents_part1_part2", fields=fields, where=where)

    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            dispatch_date_time=lambda df: pd.to_datetime(df.dispatch_date_time),
            year=lambda df: df.dispatch_date_time.dt.year,
        )
        .sort_values("dispatch_date_time", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "real_estate_tax_delinquencies")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls):
    # Today's date (note: pd.datetime was deprecated and removed;
    # use pd.Timestamp instead)
    today = pd.Timestamp.today().strftime("%Y-%m-%d")

    # Load all licenses issued before today
    carto_url = "https://phl.carto.com/api/v2/sql"
    where = f"issuedate < '{today}'"
    return (
        carto2gpd.get(carto_url, table_name="li_com_act_licenses", where=where)
        .query("legalentitytype == 'Company'")
        .drop(
            labels=[
                "geometry",
                "licensetype",
                "objectid",
                "revenuecode",
                "legalentitytype",
                "legalfirstname",
                "legallastname",
            ],
            axis=1,
            errors="ignore",
        )
        .rename(
            columns=dict(
                companyname="company_name",
                issuedate="issue_date",
                licensestatus="license_status",
                licensenum="license_num",
            )
        )
        .sort_values(by="issue_date", ascending=True)
        .reset_index(drop=True)
    )
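# A standalone sketch of the same date-bounded query pattern, hedged: it
# assumes only pandas and carto2gpd, and reuses the "li_com_act_licenses"
# table queried above.
import pandas as pd
import carto2gpd

today = pd.Timestamp.today().strftime("%Y-%m-%d")
df = carto2gpd.get(
    "https://phl.carto.com/api/v2/sql",
    table_name="li_com_act_licenses",
    where=f"issuedate < '{today}'",
)
print(df.columns.tolist())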
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "crash_data_collision_crash_2007_2017")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    # NOTE: the connection is sometimes terminated when downloading this table
    gdf = carto2gpd.get(url, "ppr_tree_canopy_outlines_2015")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls, **kwargs):
    # Query CARTO
    gdf = carto2gpd.get("https://phl.carto.com/api/v2/sql", "li_demolitions")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls, **kwargs):
    # Grab the permit data from CARTO
    df = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "li_permits",
        where=(
            "permitissuedate < '2019-01-01' "
            "AND permitdescription = 'NEW CONSTRUCTION PERMIT'"
        ),
    )
    return (
        df.dropna(subset=["geometry"])
        .to_crs(epsg=EPSG)
        .assign(x=lambda df: df.geometry.x, y=lambda df: df.geometry.y)
    )
def download(cls, **kwargs):
    # Grab the 311 service request data from CARTO
    df = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "public_cases_fc",
        where=(
            "requested_datetime < '2019-01-01'"
            " AND service_name = 'Abandoned Vehicle'"
        ),
    )
    return (
        df.dropna(subset=["geometry"])
        .to_crs(epsg=EPSG)
        .assign(x=lambda df: df.geometry.x, y=lambda df: df.geometry.y)
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "wastebaskets_big_belly")
    return (
        gdf.to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    where = "extract(year from date_added) = 2017"
    gdf = carto2gpd.get(url, "streets_code_violation_notices", where=where)
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            date_added=lambda df: pd.to_datetime(df.date_added),
            year=lambda df: df.date_added.dt.year,
        )
        .sort_values("date_added", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    where = "extract(year from sr_calldate) = 2018"
    gdf = carto2gpd.get(url, "li_serv_req", where=where)
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            sr_calldate=lambda df: pd.to_datetime(df.sr_calldate),
            year=lambda df: df.sr_calldate.dt.year,
        )
        .sort_values("sr_calldate", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, **kwargs):
    # Grab the crime data from CARTO
    df = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "incidents_part1_part2",
        where=(
            "dispatch_date < '2019-01-01'"
            " AND Text_General_Code IN "
            "('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"
        ),
    )
    return (
        df.dropna(subset=["geometry"])
        .to_crs(epsg=EPSG)
        .assign(x=lambda df: df.geometry.x, y=lambda df: df.geometry.y)
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "public_cases_fc")
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            requested_datetime=lambda df: pd.to_datetime(df.requested_datetime),
            year=lambda df: df.requested_datetime.dt.year,
        )
        .sort_values("requested_datetime", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, year=2018):
    gdf = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "RTT_SUMMARY",
        where=f"extract(year from DISPLAY_DATE) = {year}",
    )
    return (
        gdf.to_crs(epsg=EPSG)
        .drop(labels=["zip_code"], axis=1)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
    )
def download(cls, **kwargs):
    url = "https://phl.carto.com/api/v2/sql"
    gdf = (
        replace_missing_geometries(carto2gpd.get(url, "shootings"))
        .fillna(np.nan)
        .to_crs(epsg=EPSG)
    )
    return (
        gdf.pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PoliceDistricts.get())
        .assign(
            time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
            date=lambda df: pd.to_datetime(
                df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
            ),
        )
        .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
        .sort_values("date", ascending=False)
        .reset_index(drop=True)
    )
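# A hedged mini-demo of the date parsing used above: "date_" arrives as a
# full timestamp string and "time" as "HH:MM:SS"; slicing the first 10
# characters keeps the date part, and str.cat joins it with the time.
# The sample values are illustrative, not from the live table.
import pandas as pd

sample = pd.DataFrame({"date_": ["2020-06-01T00:00:00Z"], "time": ["23:15:00"]})
combined = pd.to_datetime(sample.date_.str.slice(0, 10).str.cat(sample.time, sep=" "))
print(combined)  # 2020-06-01 23:15:00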
def download(cls, year=DEFAULT_YEAR):
    # Query CARTO
    gdf = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "li_permits",
        where=(
            f"extract(year from permitissuedate) = {year} "
            "and permitdescription = 'NEW CONSTRUCTION PERMIT'"
        ),
    )
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            permitissuedate=lambda df: pd.to_datetime(df.permitissuedate),
            year=lambda df: df.permitissuedate.dt.year,
        )
        .sort_values("permitissuedate", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, year=DEFAULT_YEAR):
    # Query CARTO for all active licenses from a specific year
    gdf = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "li_business_licenses",
        where=(
            f"extract(year from initialissuedate) = {year} "
            "and licensestatus = 'Active'"
        ),
    )

    # Geocode and return
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            initialissuedate=lambda df: pd.to_datetime(df.initialissuedate),
            year=lambda df: df.initialissuedate.dt.year,
        )
        .sort_values("initialissuedate", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, year=2017):
    # Query CARTO
    gdf = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql",
        "parking_violations",
        where=f"extract(year from issue_datetime) = {year}",
    )
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            issue_datetime=lambda df: pd.to_datetime(df.issue_datetime),
            year=lambda df: df.issue_datetime.dt.year,
        )
        .sort_values("issue_datetime", ascending=False)
        .reset_index(drop=True)
    )
def download(cls, year=DEFAULT_YEAR):
    # Query CARTO
    gdf = carto2gpd.get(
        "https://phl.carto.com/api/v2/sql", "shootings", where=f"year = {year}"
    )
    return (
        replace_missing_geometries(gdf)
        .to_crs(epsg=EPSG)
        .pipe(geocode, ZIPCodes.get())
        .pipe(geocode, Neighborhoods.get())
        .pipe(geocode, PUMAs.get())
        .assign(
            time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
            date=lambda df: pd.to_datetime(
                df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
            ),
        )
        .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
        .sort_values("date", ascending=False)
        .reset_index(drop=True)
    )
def get(self, fresh=False, update_local=True):
    """Get the shooting victims data, either loading the
    currently downloaded version or a fresh copy."""
    if fresh or not self.path.exists():
        if self.debug:
            logger.debug("Downloading shooting victims database")

        df = (
            carto2gpd.get(self.ENDPOINT, self.TABLE_NAME)
            .assign(
                time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
                date=lambda df: pd.to_datetime(
                    df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                ),
                year=lambda df: df.date.dt.year,
                race=lambda df: df.race.fillna("Other/Unknown"),
                age=lambda df: df.age.astype(float),
                age_group=lambda df: np.select(
                    [
                        df.age <= 17,
                        (df.age > 17) & (df.age <= 30),
                        (df.age > 30) & (df.age <= 45),
                        (df.age > 45),
                    ],
                    ["Younger than 18", "18 to 30", "31 to 45", "Older than 45"],
                    default="Unknown",
                ),
            )
            .assign(
                race=lambda df: df.race.where(df.latino != 1, other="H"),
                age=lambda df: df.age.fillna("Unknown"),
            )
            .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
            .sort_values("date", ascending=False)
            .reset_index(drop=True)
        )

        # CHECKS
        old_df = gpd.read_file(self.path)
        if len(df) - len(old_df) > 100:
            logger.info(f"Length of new data: {len(df)}")
            logger.info(f"Length of old data: {len(old_df)}")
            raise ValueError(
                "New data seems to have too many rows...please manually confirm new data is correct."
            )

        def _add_geo_info(data, geo):
            # NOTE: geopandas >= 0.10 renamed the deprecated "op" keyword
            # to "predicate"
            return gpd.sjoin(data, geo, how="left", predicate="within").drop(
                labels=["index_right"], axis=1
            )

        # Add geographic columns
        df = (
            df.pipe(_add_geo_info, get_zip_codes().to_crs(df.crs))
            .pipe(_add_geo_info, get_police_districts().to_crs(df.crs))
            .pipe(_add_geo_info, get_council_districts().to_crs(df.crs))
            .pipe(_add_geo_info, get_neighborhoods().to_crs(df.crs))
        )

        # Save it
        if update_local:
            if self.debug:
                logger.debug("Updating saved copy of shooting victims database")
            df.to_file(self.path, driver="GeoJSON")

    # Load from disk, fill missing geometries and convert CRS
    return (
        gpd.read_file(self.path)
        .assign(
            geometry=lambda df: df.geometry.fillna(Point()),
            date=lambda df: pd.to_datetime(df.date),
        )
        .to_crs(epsg=EPSG)
    )
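# A hedged mini-demo of the np.select age binning used above; the ages are
# illustrative. NaN fails every condition, so it falls through to "Unknown".
import numpy as np
import pandas as pd

ages = pd.Series([12, 25, 40, 60, np.nan])
groups = np.select(
    [ages <= 17, (ages > 17) & (ages <= 30), (ages > 30) & (ages <= 45), ages > 45],
    ["Younger than 18", "18 to 30", "31 to 45", "Older than 45"],
    default="Unknown",
)
print(groups)  # ['Younger than 18' '18 to 30' '31 to 45' 'Older than 45' 'Unknown']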
def test_bad_where():
    url = "https://phl.carto.com/api/v2/sql"
    where = "bad_column > 0"
    with pytest.raises(ValueError):
        carto2gpd.get(url, "shootings", where=where, limit=5)
def test_no_geometry():
    url = "https://phl.carto.com/api/v2/sql"
    df = carto2gpd.get(url, "li_com_act_licenses", limit=5)
    assert isinstance(df, pd.DataFrame)
def test_limit():
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "shootings", limit=5)
    assert len(gdf) == 5
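# A hedged companion test sketch combining the carto2gpd.get options
# exercised above (fields, where, and limit all appear elsewhere in this
# file); the selected fields come from the "shootings" table as used in
# the download functions.
def test_fields_where_and_limit():
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(
        url,
        "shootings",
        fields=["date_", "fatal"],
        where="fatal = 1",
        limit=5,
    )
    assert len(gdf) <= 5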
def get(self, fresh=False, update_local=True):
    """Get the shooting victims data, either loading the
    currently downloaded version or a fresh copy."""
    if fresh or not self.path.exists():
        if self.debug:
            logger.debug("Downloading shooting victims database")

        # Raw data
        df = carto2gpd.get(self.ENDPOINT, self.TABLE_NAME)

        # Verify DC key first
        missing_dc_keys = df["dc_key"].isnull()
        if missing_dc_keys.sum() and not self.ignore_checks:
            n = missing_dc_keys.sum()
            raise ValueError(f"Found {n} rows with missing DC keys")

        # Format
        df = (
            df.assign(
                time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
                date=lambda df: pd.to_datetime(
                    df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                ),
                dc_key=lambda df: df.dc_key.astype(float).astype(int).astype(str),
                year=lambda df: df.date.dt.year,
                race=lambda df: df.race.fillna("Other/Unknown"),
                age=lambda df: df.age.astype(float),
                age_group=lambda df: np.select(
                    [
                        df.age <= 17,
                        (df.age > 17) & (df.age <= 30),
                        (df.age > 30) & (df.age <= 45),
                        (df.age > 45),
                    ],
                    ["Younger than 18", "18 to 30", "31 to 45", "Older than 45"],
                    default="Unknown",
                ),
            )
            .assign(
                race=lambda df: df.race.where(df.latino != 1, other="H"),
                age=lambda df: df.age.fillna("Unknown"),
            )
            .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
            .sort_values("date", ascending=False)
            .reset_index(drop=True)
        )

        # CHECKS
        if not self.ignore_checks:
            old_df = gpd.read_file(self.path)

            TOLERANCE = 100
            if len(df) - len(old_df) > TOLERANCE:
                logger.info(f"Length of new data: {len(df)}")
                logger.info(f"Length of old data: {len(old_df)}")
                raise ValueError(
                    "New data seems to have too many rows...please manually confirm new data is correct."
                )

            TOLERANCE = 10
            if len(old_df) - len(df) > TOLERANCE:
                logger.info(f"Length of new data: {len(df)}")
                logger.info(f"Length of old data: {len(old_df)}")
                raise ValueError(
                    "New data seems to have too few rows...please manually confirm new data is correct."
                )

        # Add geographic info
        df = add_geographic_info(df)

        # Save it
        if update_local:
            if self.debug:
                logger.debug("Updating saved copy of shooting victims database")
            df.to_file(self.path, driver="GeoJSON")

    # Load from disk, fill missing geometries and convert CRS
    out = (
        gpd.read_file(self.path, dtype={"dc_key": str})
        .assign(
            geometry=lambda df: df.geometry.fillna(Point()),
            date=lambda df: pd.to_datetime(df.date),
        )
        .to_crs(epsg=EPSG)
    )

    # Check dc_key is properly formatted
    assert (
        out["dc_key"].str.contains(".0", regex=False).sum() == 0
    ), "dc_key not properly formatted"

    return out
def add_geographic_info(df):
    """Add geographic info."""
    # Get a fresh copy
    df = df.copy().reset_index(drop=True)

    # The original length
    original_length = len(df)

    # Check city limits
    city_limits = get_city_limits().to_crs(df.crs)
    outside_limits = ~df.geometry.within(city_limits.squeeze().geometry)
    missing = outside_limits.sum()

    # Set geometry to null for rows outside the city limits
    # (NOTE: index with the boolean mask, not the integer count)
    logger.info(f"{missing} shootings outside city limits")
    if missing > 0:
        df.loc[outside_limits, "geometry"] = np.nan

    # Try to replace any missing geometries from criminal incidents
    dc_key_list = ", ".join(
        df.loc[df.geometry.isnull(), "dc_key"].apply(lambda x: f"'{x}'")
    )
    url = "https://phl.carto.com/api/v2/sql"
    incidents = carto2gpd.get(
        url, "incidents_part1_part2", where=f"dc_key IN ( {dc_key_list} )"
    )

    # Did we get any matches?
    matches = len(incidents)
    logger.info(f"Found {matches} matches for {missing} missing geometries")

    # Merge
    if matches > 0:
        missing_sel = df.geometry.isnull()
        missing_rows = df.loc[missing_sel]
        df2 = missing_rows.drop(columns=["geometry"]).merge(
            incidents[["dc_key", "geometry"]].drop_duplicates(subset=["dc_key"]),
            on="dc_key",
            how="left",
        )
        df = pd.concat([df.loc[~missing_sel], df2]).reset_index(drop=True)

    def _add_geo_info(data, geo):
        out = gpd.sjoin(data, geo, how="left", predicate="within")

        # NOTE: sometimes this will match multiple geo boundaries;
        # remove the duplicates
        duplicated = out.index.duplicated()
        if duplicated.sum():
            out = out.loc[~duplicated]

        return out.drop(labels=["index_right"], axis=1)

    # Add geographic columns
    geo_funcs = [
        get_zip_codes,
        get_police_districts,
        get_council_districts,
        get_neighborhoods,
        get_school_catchments,
        get_pa_house_districts,
    ]
    for geo_func in geo_funcs:
        df = df.pipe(_add_geo_info, geo_func().to_crs(df.crs))

    # If geo columns are missing, geometry should be NaN
    df.loc[df["hood"].isnull(), "geometry"] = np.nan

    # Check the length
    if len(df) != original_length:
        raise ValueError("Length of data has changed; this shouldn't happen!")

    return df
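# A hedged usage sketch for add_geographic_info: assumes a GeoDataFrame with
# "dc_key" and "geometry" columns (as the "shootings" table provides) and
# that the get_* boundary helpers used inside the function are importable;
# the limit of 100 rows is illustrative.
victims = carto2gpd.get("https://phl.carto.com/api/v2/sql", "shootings", limit=100)
victims = add_geographic_info(victims)
print(victims["hood"].head())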