Example #1
    def download(cls, **kwargs):
        url = "https://phl.carto.com/api/v2/sql"
        fields = [
            "dc_dist",
            "dc_key",
            "dispatch_date_time",
            "location_block",
            "psa",
            "text_general_code",
            "ucr_general",
        ]
        where = "text_general_code LIKE '%Homicide - Criminal%'"
        gdf = replace_missing_geometries(
            carto2gpd.get(url, "incidents_part1_part2", fields=fields, where=where)
        ).to_crs(epsg=EPSG)

        return (
            gdf.pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PoliceDistricts.get())
            .assign(
                dispatch_date_time=lambda df: pd.to_datetime(df.dispatch_date_time),
                year=lambda df: df.dispatch_date_time.dt.year,
                text_general_code=lambda df: df.text_general_code.str.strip(),
                time_offset=lambda df: (
                    df.dispatch_date_time - pd.to_datetime("1/1/2006").tz_localize("UTC")
                ).dt.total_seconds().values,
            )
            .sort_values("dispatch_date_time", ascending=False)
            .reset_index(drop=True)
        )
Example #2
def get_data(days):
    """
    Query the CARTO database to get shootings from the recent past.
    
    Parameters
    ----------
    days : int
        the number of days to get data for
    
    Returns
    -------
    gdf : GeoDataFrame
        the data frame holding the queried data
    """
    # Query for the data
    URL = "https://phl.carto.com/api/v2/sql"
    WHERE = f"date_ >= current_date - {days}"
    gdf = carto2gpd.get(URL, "shootings", where=WHERE)

    # Add lat/lng columns
    gdf["lat"] = gdf.geometry.y
    gdf["lng"] = gdf.geometry.x

    # Remove rows with missing geo coordinates
    gdf = gdf.dropna(subset=["lat", "lng"])

    return gdf
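
A minimal usage sketch (not part of the original example), assuming carto2gpd, pandas, and geopandas are installed and the get_data function above is in scope:

# Hypothetical usage of get_data(): pull the last 30 days of shootings
# and inspect the coordinate columns the function adds.
recent = get_data(days=30)
print(len(recent), "shootings in the last 30 days")
print(recent[["lat", "lng"]].head())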
Example #3
def covid_realtime():
    """
    Get real-time accumulated covid data by each zip code.
    """
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "covid_cases_by_zip",fields=['zip_code', 'count', "etl_timestamp"])
    return gdf
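
A short, hedged sketch of consuming the returned frame (the column names come from the fields list above):

# Hypothetical usage of covid_realtime(): total the cumulative case counts
# and show the ZIP codes with the most cases.
covid = covid_realtime()
print("Total cases:", covid["count"].sum())
print(covid.sort_values("count", ascending=False).head())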
Example #4
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "ppr_tree_canopy_points_2015")

        return (gdf.to_crs(epsg=EPSG).pipe(geocode, ZIPCodes.get()).pipe(
            geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()))
Example #5
def get_data(days):
    """
    Query the CARTO database to get shootings from the recent past.
    
    Parameters
    ----------
    days : int
        the number of days to get data for
    
    Returns
    -------
    gdf : GeoDataFrame
        the data frame holding the queried data
    """
    # Query for the data
    URL = "https://phl.carto.com/api/v2/sql"
    WHERE = f"date_ >= current_date - {days}"
    gdf = carto2gpd.get(URL, "shootings", where=WHERE)

    # Re-map the fatal column to Yes/No
    gdf["fatal"] = gdf["fatal"].map({0: "No", 1: "Yes"})

    # Remove entries where fatal is NaN
    gdf = gdf.dropna(subset=["fatal"])

    # Remove missing geometries
    gdf = gdf.loc[gdf.geometry.notnull()]

    return gdf
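
A small follow-up sketch, assuming the function above is importable: tabulate fatal versus non-fatal incidents in the queried window.

# Hypothetical usage: the "fatal" column was re-mapped to Yes/No above.
gdf = get_data(days=90)
print(gdf["fatal"].value_counts())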
Example #6
    def download(cls, **kwargs):

        # the raw data
        fields = [
            "dc_dist",
            "dc_key",
            "dispatch_date_time",
            "location_block",
            "psa",
            "text_general_code",
            "ucr_general",
        ]

        url = "https://phl.carto.com/api/v2/sql"
        where = "extract(year from dispatch_date_time) = 2018"
        gdf = carto2gpd.get(url,
                            "incidents_part1_part2",
                            fields=fields,
                            where=where)

        return (replace_missing_geometries(gdf).to_crs(epsg=EPSG).pipe(
            geocode, ZIPCodes.get()).pipe(geocode, Neighborhoods.get()).pipe(
                geocode, PUMAs.get()).assign(
                    dispatch_date_time=lambda df: pd.to_datetime(
                        df.dispatch_date_time),
                    year=lambda df: df.dispatch_date_time.dt.year,
                ).sort_values("dispatch_date_time",
                              ascending=False).reset_index(drop=True))
Example #7
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "real_estate_tax_delinquencies")

        return (gdf.to_crs(epsg=EPSG).pipe(geocode, ZIPCodes.get()).pipe(
            geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()))
Example #8
    def download(cls):

        # today's date
        today = pd.Timestamp.today().strftime("%Y-%m-%d")

        # load all licenses
        carto_url = "https://phl.carto.com/api/v2/sql"
        where = f"issuedate < '{today}'"
        return (carto2gpd.get(
            carto_url, table_name="li_com_act_licenses",
            where=where).query("legalentitytype == 'Company'").drop(
                labels=[
                    "geometry",
                    "licensetype",
                    "objectid",
                    "revenuecode",
                    "legalentitytype",
                    "legalfirstname",
                    "legallastname",
                ],
                axis=1,
                errors="ignore",
            ).rename(columns=dict(
                companyname="company_name",
                issuedate="issue_date",
                licensestatus="license_status",
                licensenum="license_num",
            )).sort_values(by="issue_date",
                           ascending=True).reset_index(drop=True))
Example #9
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "crash_data_collision_crash_2007_2017")

        return (gdf.to_crs(epsg=EPSG).pipe(geocode, ZIPCodes.get()).pipe(
            geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()))
Example #10
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "ppr_tree_canopy_outlines_2015"
                            )  ### connection terminated when I try

        return (gdf.to_crs(epsg=EPSG).pipe(geocode, ZIPCodes.get()).pipe(
            geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()))
Example #11
    def download(cls, **kwargs):

        # Query CARTO
        gdf = carto2gpd.get("https://phl.carto.com/api/v2/sql",
                            "li_demolitions")

        return (gdf.to_crs(epsg=EPSG).pipe(geocode, ZIPCodes.get()).pipe(
            geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()))
Example #12
    def download(cls, **kwargs):

        # grab the permit data from CARTO
        df = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "li_permits",
            where=("permitissuedate < '2019-01-01' "
                   "AND permitdescription='NEW CONSTRUCTION PERMIT'"),
        )
        return (df.dropna(subset=["geometry"]).to_crs(epsg=EPSG).assign(
            x=lambda df: df.geometry.x, y=lambda df: df.geometry.y))
Example #13
    def download(cls, **kwargs):

        # grab the crime data from CARTO
        df = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "public_cases_fc",
            where=("requested_datetime < '2019-01-01'"
                   " AND service_name = 'Abandoned Vehicle'"),
        )
        return (df.dropna(subset=["geometry"]).to_crs(epsg=EPSG).assign(
            x=lambda df: df.geometry.x, y=lambda df: df.geometry.y))
Example #14
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "wastebaskets_big_belly")  

        return (
            gdf.to_crs(epsg=EPSG) 
            .pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PUMAs.get())
        )
Example #15
    def download(cls, **kwargs):
        url = "https://phl.carto.com/api/v2/sql"
        where = "extract(year from date_added) = 2017"
        gdf = carto2gpd.get(url, "streets_code_violation_notices", where=where)

        return (replace_missing_geometries(gdf).to_crs(epsg=EPSG).pipe(
            geocode, ZIPCodes.get()).pipe(geocode, Neighborhoods.get()).pipe(
                geocode, PUMAs.get()).assign(
                    date_added=lambda df: pd.to_datetime(df.date_added),
                    year=lambda df: df.date_added.dt.year,
                ).sort_values("date_added",
                              ascending=False).reset_index(drop=True))
Example #16
    def download(cls, **kwargs):
        url = "https://phl.carto.com/api/v2/sql"
        where = "extract(year from sr_calldate) = 2018"
        gdf = carto2gpd.get(url, "li_serv_req", where=where)

        return (replace_missing_geometries(gdf).to_crs(epsg=EPSG).pipe(
            geocode, ZIPCodes.get()).pipe(geocode, Neighborhoods.get()).pipe(
                geocode, PUMAs.get()).assign(
                    sr_calldate=lambda df: pd.to_datetime(df.sr_calldate),
                    year=lambda df: df.sr_calldate.dt.year,
                ).sort_values("sr_calldate",
                              ascending=False).reset_index(drop=True))
Example #17
    def download(cls, **kwargs):

        # grab the crime data from CARTO
        df = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "incidents_part1_part2",
            where=(
                "dispatch_date < '2019-01-01'"
                " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"
            ),
        )
        return (df.dropna(subset=["geometry"]).to_crs(epsg=EPSG).assign(
            x=lambda df: df.geometry.x, y=lambda df: df.geometry.y))
Example #18
    def download(cls, **kwargs):

        url = "https://phl.carto.com/api/v2/sql"
        gdf = carto2gpd.get(url, "public_cases_fc")

        return (replace_missing_geometries(gdf).to_crs(epsg=EPSG).pipe(
            geocode, ZIPCodes.get()).pipe(geocode, Neighborhoods.get()).pipe(
                geocode, PUMAs.get()).assign(
                    requested_datetime=lambda df: pd.to_datetime(
                        df.requested_datetime),
                    year=lambda df: df.requested_datetime.dt.year,
                ).sort_values("requested_datetime",
                              ascending=False).reset_index(drop=True))
Example #19
    def download(cls, year=2018):

        gdf = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "RTT_SUMMARY",
            where=f"extract(year from DISPLAY_DATE) = {year}",
        )
        return (
            gdf.to_crs(epsg=EPSG)
            .drop(labels=["zip_code"], axis=1)
            .pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PUMAs.get())
        )
Example #20
    def download(cls, **kwargs):
        url = "https://phl.carto.com/api/v2/sql"
        gdf = (
            replace_missing_geometries(carto2gpd.get(url, "shootings"))
            .fillna(np.nan)
            .to_crs(epsg=EPSG)
        )

        return (
            gdf.pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PoliceDistricts.get())
            .assign(
                time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
                date=lambda df: pd.to_datetime(
                    df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                ),
            )
            .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
            .sort_values("date", ascending=False)
            .reset_index(drop=True)
        )
Example #21
    def download(cls, year=DEFAULT_YEAR):

        # Query CARTO
        gdf = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "li_permits",
            where=(
                f"extract(year from permitissuedate) = {year} "
                "and permitdescription = 'NEW CONSTRUCTION PERMIT'"
            ),
        )

        return (
            replace_missing_geometries(gdf)
            .to_crs(epsg=EPSG)
            .pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PUMAs.get())
            .assign(
                permitissuedate=lambda df: pd.to_datetime(df.permitissuedate),
                year=lambda df: df.permitissuedate.dt.year,
            )
            .sort_values("permitissuedate", ascending=False)
            .reset_index(drop=True)
        )
Example #22
    def download(cls, year=DEFAULT_YEAR):

        # Query carto for all active licenses from a specific year
        gdf = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "li_business_licenses",
            where=f"extract(year from initialissuedate) = {year} and licensestatus = 'Active'",
        )

        # Geocode and return
        return (replace_missing_geometries(gdf).to_crs(epsg=EPSG).pipe(
            geocode, ZIPCodes.get()
        ).pipe(geocode, Neighborhoods.get()).pipe(geocode, PUMAs.get()).assign(
            initialissuedate=lambda df: pd.to_datetime(df.initialissuedate),
            year=lambda df: df.initialissuedate.dt.year,
        ).sort_values("initialissuedate",
                      ascending=False).reset_index(drop=True))
Example #23
    def download(cls, year=2017):

        # Query CARTO
        gdf = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql",
            "parking_violations",
            where=f"extract(year from issue_datetime) = {year}",
        )

        return (
            replace_missing_geometries(gdf)
            .to_crs(epsg=EPSG)
            .pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PUMAs.get())
            .assign(
                issue_datetime=lambda df: pd.to_datetime(df.issue_datetime),
                year=lambda df: df.issue_datetime.dt.year,
            )
            .sort_values("issue_datetime", ascending=False)
            .reset_index(drop=True)
        )
Example #24
    def download(cls, year=DEFAULT_YEAR):

        # Query CARTO
        gdf = carto2gpd.get(
            "https://phl.carto.com/api/v2/sql", "shootings", where=f"year = {year}"
        )

        return (
            replace_missing_geometries(gdf)
            .to_crs(epsg=EPSG)
            .pipe(geocode, ZIPCodes.get())
            .pipe(geocode, Neighborhoods.get())
            .pipe(geocode, PUMAs.get())
            .assign(
                time=lambda df: df.time.replace("<Null>", np.nan).fillna("00:00:00"),
                date=lambda df: pd.to_datetime(
                    df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                ),
            )
            .drop(labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1)
            .sort_values("date", ascending=False)
            .reset_index(drop=True)
        )
Example #25
    def get(self, fresh=False, update_local=True):
        """Get the shooting victims data, either loading
        the currently downloaded version or a fresh copy."""

        if fresh or not self.path.exists():

            if self.debug:
                logger.debug("Downloading shooting victims database")

            df = (
                carto2gpd.get(self.ENDPOINT, self.TABLE_NAME)
                .assign(
                    time=lambda df: df.time.replace("<Null>", np.nan).fillna(
                        "00:00:00"
                    ),
                    date=lambda df: pd.to_datetime(
                        df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                    ),
                    year=lambda df: df.date.dt.year,
                    race=lambda df: df.race.fillna("Other/Unknown"),
                    age=lambda df: df.age.astype(float),
                    age_group=lambda df: np.select(
                        [
                            df.age <= 17,
                            (df.age > 17) & (df.age <= 30),
                            (df.age > 30) & (df.age <= 45),
                            (df.age > 45),
                        ],
                        ["Younger than 18", "18 to 30", "31 to 45", "Older than 45"],
                        default="Unknown",
                    ),
                )
                .assign(
                    race=lambda df: df.race.where(df.latino != 1, other="H"),
                    age=lambda df: df.age.fillna("Unknown"),
                )
                .drop(
                    labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1
                )
                .sort_values("date", ascending=False)
                .reset_index(drop=True)
            )

            # CHECKS
            old_df = gpd.read_file(self.path)
            if len(df) - len(old_df) > 100:
                logger.info(f"Length of new data: {len(df)}")
                logger.info(f"Length of old data: {len(old_df)}")
                raise ValueError(
                    "New data seems to have too many rows...please manually confirm new data is correct."
                )

            def _add_geo_info(data, geo):
                return gpd.sjoin(data, geo, how="left", op="within").drop(
                    labels=["index_right"], axis=1
                )

            # Add geographic columns
            df = (
                df.pipe(_add_geo_info, get_zip_codes().to_crs(df.crs))
                .pipe(_add_geo_info, get_police_districts().to_crs(df.crs))
                .pipe(_add_geo_info, get_council_districts().to_crs(df.crs))
                .pipe(_add_geo_info, get_neighborhoods().to_crs(df.crs))
            )

            # Save it
            if update_local:
                if self.debug:
                    logger.debug("Updating saved copy of shooting victims database")
                df.to_file(self.path, driver="GeoJSON")

        # Load from disk, fill missing geometries and convert CRS
        return (
            gpd.read_file(self.path)
            .assign(
                geometry=lambda df: df.geometry.fillna(Point()),
                date=lambda df: pd.to_datetime(df.date),
            )
            .to_crs(epsg=EPSG)
        )
Example #26
def test_bad_where():
    url = "https://phl.carto.com/api/v2/sql"
    where = "bad_column > 0"
    with pytest.raises(ValueError):
        gdf = carto2gpd.get(url, "shootings", where=where, limit=5)
Example #27
def test_no_geometry():
    url = "https://phl.carto.com/api/v2/sql"
    df = carto2gpd.get(url, "li_com_act_licenses", limit=5)
    assert isinstance(df, pd.DataFrame)
Example #28
def test_limit():
    url = "https://phl.carto.com/api/v2/sql"
    gdf = carto2gpd.get(url, "shootings", limit=5)
    assert len(gdf) == 5
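
The three tests above exercise the keyword arguments of carto2gpd.get separately; a combined sketch follows (table and column names are taken from the other examples, the filter value is assumed for illustration):

# Hypothetical query combining fields, where, and limit:
# restrict the columns, filter rows server-side, and cap the result size.
import carto2gpd

url = "https://phl.carto.com/api/v2/sql"
sample = carto2gpd.get(
    url,
    "shootings",
    fields=["dc_key", "race", "fatal"],
    where="fatal = 1",
    limit=10,
)
print(sample.head())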
Example #29
    def get(self, fresh=False, update_local=True):
        """Get the shooting victims data, either loading
        the currently downloaded version or a fresh copy."""

        if fresh or not self.path.exists():

            if self.debug:
                logger.debug("Downloading shooting victims database")

            # Raw data
            df = carto2gpd.get(self.ENDPOINT, self.TABLE_NAME)

            # Verify DC key first
            missing_dc_keys = df["dc_key"].isnull()
            if missing_dc_keys.sum() and not self.ignore_checks:
                n = missing_dc_keys.sum()
                raise ValueError(f"Found {n} rows with missing DC keys")

            # Format
            df = (
                df.assign(
                    time=lambda df: df.time.replace("<Null>", np.nan).fillna(
                        "00:00:00"
                    ),
                    date=lambda df: pd.to_datetime(
                        df.date_.str.slice(0, 10).str.cat(df.time, sep=" ")
                    ),
                    dc_key=lambda df: df.dc_key.astype(float).astype(int).astype(str),
                    year=lambda df: df.date.dt.year,
                    race=lambda df: df.race.fillna("Other/Unknown"),
                    age=lambda df: df.age.astype(float),
                    age_group=lambda df: np.select(
                        [
                            df.age <= 17,
                            (df.age > 17) & (df.age <= 30),
                            (df.age > 30) & (df.age <= 45),
                            (df.age > 45),
                        ],
                        ["Younger than 18", "18 to 30", "31 to 45", "Older than 45"],
                        default="Unknown",
                    ),
                )
                .assign(
                    race=lambda df: df.race.where(df.latino != 1, other="H"),
                    age=lambda df: df.age.fillna("Unknown"),
                )
                .drop(
                    labels=["point_x", "point_y", "date_", "time", "objectid"], axis=1
                )
                .sort_values("date", ascending=False)
                .reset_index(drop=True)
            )

            # CHECKS
            if not self.ignore_checks:
                old_df = gpd.read_file(self.path)
                TOLERANCE = 100
                if len(df) - len(old_df) > TOLERANCE:
                    logger.info(f"Length of new data: {len(df)}")
                    logger.info(f"Length of old data: {len(old_df)}")
                    raise ValueError(
                        "New data seems to have too many rows...please manually confirm new data is correct."
                    )

                TOLERANCE = 10
                if len(old_df) - len(df) > TOLERANCE:
                    logger.info(f"Length of new data: {len(df)}")
                    logger.info(f"Length of old data: {len(old_df)}")
                    raise ValueError(
                        "New data seems to have too few rows...please manually confirm new data is correct."
                    )

            # Add geographic info
            df = add_geographic_info(df)

            # Save it
            if update_local:
                if self.debug:
                    logger.debug("Updating saved copy of shooting victims database")
                df.to_file(self.path, driver="GeoJSON")

        # Load from disk, fill missing geometries and convert CRS
        out = (
            gpd.read_file(self.path, dtype={"dc_key": str})
            .assign(
                geometry=lambda df: df.geometry.fillna(Point()),
                date=lambda df: pd.to_datetime(df.date),
            )
            .to_crs(epsg=EPSG)
        )

        # Check dc_key is properly formatted
        assert (
            out["dc_key"].str.contains(".0", regex=False).sum() == 0
        ), "dc_key not properly formatted"

        return out
Example #30
def add_geographic_info(df):
    """Add geographic info."""

    # Get a fresh copy
    df = df.copy().reset_index(drop=True)

    # The original length
    original_length = len(df)

    # Check city limits
    city_limits = get_city_limits().to_crs(df.crs)
    outside_limits = ~df.geometry.within(city_limits.squeeze().geometry)
    missing = outside_limits.sum()

    # Set missing geometry to null
    logger.info(f"{missing} shootings outside city limits")
    if missing > 0:
        df.loc[outside_limits, "geometry"] = np.nan

    # Try to replace any missing geometries from criminal incidents
    dc_key_list = ", ".join(
        df.loc[df.geometry.isnull(), "dc_key"].apply(lambda x: f"'{x}'")
    )
    url = "https://phl.carto.com/api/v2/sql"
    incidents = carto2gpd.get(
        url, "incidents_part1_part2", where=f"dc_key IN ( {dc_key_list} )"
    )

    # Did we get any matches
    matches = len(incidents)
    logger.info(f"Found {matches} matches for {missing} missing geometries")

    # Merge
    if matches > 0:

        missing_sel = df.geometry.isnull()
        missing = df.loc[missing_sel]
        df2 = missing.drop(columns=["geometry"]).merge(
            incidents[["dc_key", "geometry"]].drop_duplicates(subset=["dc_key"]),
            on="dc_key",
            how="left",
        )
        df = pd.concat([df.loc[~missing_sel], df2]).reset_index(drop=True)

    def _add_geo_info(data, geo):
        out = gpd.sjoin(data, geo, how="left", predicate="within")

        # NOTE: sometimes this will match multiple geo boundaries
        # REMOVE THEM
        duplicated = out.index.duplicated()
        if duplicated.sum():
            out = out.loc[~duplicated]

        return out.drop(labels=["index_right"], axis=1)

    # Add geographic columns
    geo_funcs = [
        get_zip_codes,
        get_police_districts,
        get_council_districts,
        get_neighborhoods,
        get_school_catchments,
        get_pa_house_districts,
    ]
    for geo_func in geo_funcs:
        df = df.pipe(_add_geo_info, geo_func().to_crs(df.crs))

    # if geo columns are missing, geometry should be NaN
    df.loc[df["hood"].isnull(), "geometry"] = np.nan

    # Check the length
    if len(df) != original_length:
        raise ValueError("Length of data has changed; this shouldn't happen!")

    return df
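
A hedged usage sketch for the helper above, assuming the geographic lookup functions and logger it relies on (get_city_limits, get_zip_codes, etc.) are importable from the same module:

# Hypothetical usage: download one year of shootings and attach the
# geographic columns (ZIP code, police district, neighborhood, ...).
import carto2gpd

url = "https://phl.carto.com/api/v2/sql"
shootings = carto2gpd.get(url, "shootings", where="year = 2022")
shootings = add_geographic_info(shootings)
print(shootings["hood"].value_counts().head())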