def testRenameWithRegex(self) -> None:
        # Arrange
        subject = pd.DataFrame(
            {
                "County": ["Anderson", "Andrews", "Angelina"],
                "PRETRIAL": [90, 20, 105],
                "CON. Felons": [2, 11, 26],
            }
        )

        rename_dict = {
            r".*Cou.*": "facility_name",
            r"PRETRIAL": "pretrial_adp",
            r"CON\. Felons": "convicted_adp",
        }

        # Act
        result = aggregate_ingest_utils.rename_columns_and_select(
            subject, rename_dict, use_regex=True
        )

        # Assert
        expected_result = pd.DataFrame(
            {
                "facility_name": ["Anderson", "Andrews", "Angelina"],
                "pretrial_adp": [90, 20, 105],
                "convicted_adp": [2, 11, 26],
            }
        )

        assert_frame_equal(result, expected_result)
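The helper these examples exercise, aggregate_ingest_utils.rename_columns_and_select, is not reproduced on this page. Below is a minimal sketch of the behavior the test above implies (rename the matched columns, drop everything else, keep the order of the rename dict); the project's real implementation may differ in details such as validation.

import re
from typing import Dict

import pandas as pd


def rename_columns_and_select(
    df: pd.DataFrame, rename_dict: Dict[str, str], use_regex: bool = False
) -> pd.DataFrame:
    """Rename the columns matched by |rename_dict| and drop all other columns."""
    if use_regex:
        # Map each existing column to its new name when a pattern matches it.
        mapping = {
            column: new_name
            for pattern, new_name in rename_dict.items()
            for column in df.columns
            if re.match(pattern, str(column))
        }
    else:
        mapping = dict(rename_dict)

    # Keep only the renamed columns, in the order the dict lists them.
    return df.rename(columns=mapping)[list(rename_dict.values())]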
Example #2
def _format_df(df: pd.DataFrame) -> pd.DataFrame:
    """Format the DataFrame to match the schema."""
    result = _transpose_df(df)

    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'report_date': 'report_date',
        'Census': 'census',
        'In House': 'in_house',
        'Boarded In': 'boarded_in',
        'Boarded Out': 'boarded_out',
        '- Sentenced': 'sentenced',
        '- Civil': 'civil',
        '- Federal': 'federal',
        '- Technical Parole Violators': 'technical_parole_violators',
        '- State Readies': 'state_readies',
        '- Other Unsentenced **': 'other_unsentenced'
    })

    result['report_date'] = result['report_date'].apply(_parse_report_date)

    for column_name in set(result.columns) - {'report_date'}:
        result[column_name] = result[column_name].apply(locale.atoi)

    result['facility_name'] = df['FACILITY'].iloc[0]

    return result
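Several of these parsers convert formatted counts (e.g. "1,234") with locale.atoi, which only strips thousands separators once a grouping-aware locale is active. A small illustration; the en_US.UTF-8 locale name is an assumption and may not be installed on every system.

import locale

# Under the default "C" locale there is no thousands separator, so
# locale.atoi("1,234") raises ValueError; a grouping-aware locale fixes that.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
assert locale.atoi("1,234") == 1234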
Example #3
def _parse_county_table(filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF."""
    [result] = tabula.read_pdf(
        filename,
        pages=[3, 4],
        multiple_tables=False,
        pandas_options={"skipfooter": 1, "engine": "python"},
    )

    result.columns = [c.replace("\r", " ") for c in result.columns]
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Florida County": "county_name",
            "County Population": "county_population",
            "Average Daily Population (ADP)": "average_daily_population",
            "*Date Reported": "date_reported",
        },
    )

    # Drop rows from header on second table (page 4)
    result = result[~result["county_name"].isin(("Florida", "County"))]

    for column_name in {"county_population", "average_daily_population"}:
        result[column_name] = result[column_name].apply(locale.atoi)

    # Sometimes extra notes are indicated in the date reported field.
    result["date_reported"] = result["date_reported"].str.replace(r"^\*\*$", "")

    result["date_reported"] = pd.to_datetime(result["date_reported"])

    return result
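tabula.read_pdf returns a list of DataFrames even when multiple_tables=False, which is why the snippet above unpacks it as [result] = tabula.read_pdf(...). A more explicit equivalent, shown purely for illustration and reusing the same arguments as above:

tables = tabula.read_pdf(
    filename,
    pages=[3, 4],
    multiple_tables=False,
    pandas_options={"skipfooter": 1, "engine": "python"},
)
# The [result] = ... form fails loudly if tabula ever returns more (or fewer)
# than exactly one table, whereas plain indexing would silently ignore extras.
assert len(tables) == 1
result = tables[0]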
Example #4
def _parse_county_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF."""
    part1 = read_pdf(location,
                     filename,
                     pages=[3],
                     pandas_options={
                         'header': [0, 1],
                     })
    part2 = read_pdf(
        location,
        filename,
        pages=[4],
        pandas_options={
            'header': [0, 1],
            'skipfooter': 1,  # The last row is the total
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })
    result = pd.concat([part1, part2], ignore_index=True)

    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Florida County': 'county_name',
            'County Population': 'county_population',
            'Average Daily Population (ADP)': 'average_daily_population',
            '*Date Reported': 'date_reported'
        })

    for column_name in {'county_population', 'average_daily_population'}:
        result[column_name] = result[column_name].apply(locale.atoi)
    result['date_reported'] = pd.to_datetime(result['date_reported'])

    return result
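Reading with header=[0, 1] produces a two-level MultiIndex, and aggregate_ingest_utils.collapse_header flattens it back into single column names before renaming. Its implementation is not shown on this page; a plausible sketch (an assumption, not necessarily the project's actual code) that joins the header levels and skips the unnamed placeholders pandas inserts for blank cells:

from typing import List

import pandas as pd


def collapse_header(columns: pd.MultiIndex) -> List[str]:
    """Flatten a two-row header into single strings such as 'Florida County'."""
    collapsed = []
    for levels in columns:
        # Ignore 'Unnamed: 0_level_1'-style fillers pandas adds for empty cells.
        parts = [str(level).strip() for level in levels
                 if not str(level).startswith("Unnamed")]
        collapsed.append(" ".join(parts))
    return collapsed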
Example #5
def _format_df(df: pd.DataFrame) -> pd.DataFrame:
    """Format the DataFrame to match the schema."""
    result = _transpose_df(df)

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "report_date": "report_date",
            "Census": "census",
            "In House": "in_house",
            "Boarded In": "boarded_in",
            "Boarded Out": "boarded_out",
            "- Sentenced": "sentenced",
            "- Civil": "civil",
            "- Federal": "federal",
            "- Technical Parole Violators": "technical_parole_violators",
            "- State Readies": "state_readies",
            "- Other Unsentenced **": "other_unsentenced",
        },
    )

    result["report_date"] = result["report_date"].apply(_parse_report_date)

    for column_name in set(result.columns) - {"report_date"}:
        result[column_name] = result[column_name].apply(
            lambda d: int(d)
            if isinstance(d, (int, float))
            else (0 if "(" in d else locale.atoi(d))
        )

    result["facility_name"] = df["FACILITY"].iloc[0]

    return result
Example #6
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report."""

    # Although the file is downloaded with the '.xls' extension, its contents
    # are actually an HTML document.
    df = pd.read_html(filename, header=0)[0]
    df = df.fillna(0)

    df['report_date'] = df[['Year', 'Month']].apply(_last_date_of_month,
                                                    axis='columns')

    df = aggregate_ingest_utils.rename_columns_and_select(
        df, {
            'Jurisdiction': 'jurisdiction_name',
            'Facility': 'facility_name',
            'Total facility ADP': 'average_daily_population',
            'Unsentenced males': 'unsentenced_male_adp',
            'Unsentenced females': 'unsentenced_female_adp',
            'Sentenced males': 'sentenced_male_adp',
            'Sentenced females': 'sentenced_female_adp',
            'report_date': 'report_date'
        })

    string_columns = {'jurisdiction_name', 'facility_name', 'report_date'}
    df = aggregate_ingest_utils.cast_columns_to_int(
        df, ignore_columns=string_columns)

    return df
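aggregate_ingest_utils.cast_columns_to_int is another helper that is not listed on this page. A rough sketch of what the calls above suggest it does (cast every column to int except those named in ignore_columns); treat this as an assumption about the real helper:

from typing import Set

import pandas as pd


def cast_columns_to_int(df: pd.DataFrame, ignore_columns: Set[str]) -> pd.DataFrame:
    """Cast every column not listed in |ignore_columns| to int."""
    for column in set(df.columns) - ignore_columns:
        df[column] = df[column].astype(int)
    return df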
Example #7
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report."""

    # Although the file is downloaded with the '.xls' extension, its contents
    # are actually an HTML document.
    df = pd.read_html(filename, header=0)[0]
    df = df.fillna(0)

    df["report_date"] = df[["Year", "Month"]].apply(_last_date_of_month, axis="columns")

    df = aggregate_ingest_utils.rename_columns_and_select(
        df,
        {
            "Jurisdiction": "jurisdiction_name",
            "Facility": "facility_name",
            "Total facility ADP": "average_daily_population",
            "Unsentenced males": "unsentenced_male_adp",
            "Unsentenced females": "unsentenced_female_adp",
            "Sentenced males": "sentenced_male_adp",
            "Sentenced females": "sentenced_female_adp",
            "report_date": "report_date",
        },
    )

    string_columns = {"jurisdiction_name", "facility_name", "report_date"}
    df = aggregate_ingest_utils.cast_columns_to_int(df, ignore_columns=string_columns)

    return df
Example #8
    def testRenameWithRegex(self):
        # Arrange
        subject = pd.DataFrame({
            'County': ['Anderson', 'Andrews', 'Angelina'],
            'PRETRIAL': [90, 20, 105],
            'CON. Felons': [2, 11, 26],
        })

        rename_dict = {
            r'.*Cou.*': 'facility_name',
            r'PRETRIAL': 'pretrial_adp',
            r'CON\. Felons': 'convicted_adp'
        }

        # Act
        result = aggregate_ingest_utils.rename_columns_and_select(
            subject, rename_dict, use_regex=True)

        # Assert
        expected_result = pd.DataFrame({
            'facility_name': ['Anderson', 'Andrews', 'Angelina'],
            'pretrial_adp': [90, 20, 105],
            'convicted_adp': [2, 11, 26],
        })

        assert_frame_equal(result, expected_result)
Example #9
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF."""

    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index', 'Jurisdiction', 'Total Number of Inmates In Jail',
        'Jail Capacity', 'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    result = read_pdf(
        location,
        filename,
        pages=pages,
        lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
            'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
            'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
            'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
            'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]': 'number_of_other_inmates'
        })

    # Tabula may parse extra empty rows
    result = result.dropna()

    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})

    return result
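The GA table repeats its header at the top of every parsed page, and _header_on_each_page supplies the row indices for skiprows. Its body is not shown here; a sketch modeled on the hard-coded [x * 48 for x in range(4)] used for newer reports in Example #18 below, with the rows-per-page count being an assumption:

from typing import List

# Assumption: each of the four parsed pages holds this many table rows.
_ROWS_PER_PAGE = 48


def _header_on_each_page() -> List[int]:
    """Row indices (in the concatenated table) of each page's repeated header."""
    return [page * _ROWS_PER_PAGE for page in range(4)]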
Example #10
def _parse_facility_table(_: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF."""
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        "Detention Facility Name",
        "Average Daily Population",
        "Number Felony Pretrial",
        "Number Misdemeanor Pretrial",
        "Total Percent Pretrial",
    ]

    part1 = tabula.read_pdf(
        filename,
        pages=[5],
        pandas_options={
            "skiprows": [0, 1, 2],
            "names": column_names,
        },
    )
    part2 = tabula.read_pdf(
        filename,
        pages=[6],
        pandas_options={
            "skiprows": [0, 1, 2],
            "usecols": [0, 2, 3, 4, 5],  # Column 1 contains no data
            "names": column_names,
            "skipfooter": 2,  # The last 2 rows are the totals
            "engine": "python",  # Only python engine supports 'skipfooter'
        },
    )
    result = pd.concat([part1, part2], ignore_index=True)

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Detention Facility Name": "facility_name",
            "Average Daily Population": "average_daily_population",
            "Number Felony Pretrial": "number_felony_pretrial",
            "Number Misdemeanor Pretrial": "number_misdemeanor_pretrial",
        },
    )

    result["average_daily_population"] = (
        result["average_daily_population"].apply(_use_stale_adp).apply(_to_int)
    )
    for column_name in {"number_felony_pretrial", "number_misdemeanor_pretrial"}:
        result[column_name] = result[column_name].apply(_to_int)

    return result
Example #11
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report."""
    column_names = {
        r"County Name": "facility_name",
        r"Bed Capacity": "bed_capacity",
        r".*Community Corrections Beds.*":
        "work_release_community_corrections_beds",
        r".*In-House Daily Pop.*": "in_house_adp",
        r".*Housed Elsewhere Daily Pop.*": "housed_elsewhere_adp",
        r".*In-House Work Release.*": "work_release_adp",
        r"Admissions": "admissions",
        r"Discharge": "discharge",
    }

    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    keep_default_na = False
    df = pd.read_excel(
        filename,
        sheet_name=0,
        header=1,
        keep_default_na=keep_default_na,
        engine="openpyxl",
    )

    # Drop "F/T" and "P/T" line
    df = df[1:]

    # Drop Totals footer
    df = df[:-9]

    df.columns = df.columns.map(lambda name: name.rstrip(" "))
    df = aggregate_ingest_utils.rename_columns_and_select(df,
                                                          column_names,
                                                          use_regex=True)

    # Some cells have extra '*'
    df = df.applymap(lambda e: str(e).rstrip(" *"))

    df = df.apply(_to_numeric)

    df["report_date"] = _report_date_tab_1(filename)
    df = fips.add_column_to_df(df, df["facility_name"], us.states.PA)
    df["aggregation_window"] = enum_strings.yearly_granularity
    df["report_frequency"] = enum_strings.yearly_granularity

    return df.reset_index(drop=True)
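The PA sheet is read with keep_default_na=False precisely so that "N/A" and "N/R" survive as strings, and _to_numeric is then responsible for mapping them. A hedged sketch of one way that mapping could look; the real _to_numeric, and how it distinguishes "N/A" from "N/R", is not shown on this page:

import pandas as pd


def _to_numeric(column: pd.Series) -> pd.Series:
    """Convert a column to numbers, treating 'N/A' and 'N/R' as missing."""
    def parse(cell):
        if cell in ("N/A", "N/R", ""):
            return None
        try:
            return pd.to_numeric(cell)
        except (ValueError, TypeError):
            # Non-numeric columns such as facility_name pass through unchanged.
            return cell

    return column.map(parse)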
Example #12
def _parse_facility_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF."""
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        'Detention Facility Name',
        'Average Daily Population',
        'Number Felony Pretrial',
        'Number Misdemeanor Pretrial',
        'Total Percent Pretrial']

    part1 = read_pdf(
        location,
        filename,
        pages=[5],
        pandas_options={
            'skiprows': [0, 1, 2],
            'names': column_names,
        })
    part2 = read_pdf(
        location,
        filename,
        pages=[6],
        pandas_options={
            'skiprows': [0, 1, 2],
            'usecols': [0, 2, 3, 4, 5],  # Column 1 contains no data
            'names': column_names,
            'skipfooter': 2,  # The last 2 rows are the totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })
    result = pd.concat([part1, part2], ignore_index=True)

    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'Detention Facility Name': 'facility_name',
        'Average Daily Population': 'average_daily_population',
        'Number Felony Pretrial': 'number_felony_pretrial',
        'Number Misdemeanor Pretrial': 'number_misdemeanor_pretrial'
    })

    result['average_daily_population'] = result[
        'average_daily_population'].apply(_use_stale_adp).apply(_to_int)
    for column_name in {'number_felony_pretrial',
                        'number_misdemeanor_pretrial'}:
        result[column_name] = result[column_name].apply(_to_int)

    return result
Example #13
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report."""
    column_names = {
        r'County Name': 'facility_name',
        r'Bed Capacity': 'bed_capacity',
        r'.*Community Corrections Beds.*':
        'work_release_community_corrections_beds',
        r'.*In-House Daily Pop.*': 'in_house_adp',
        r'.*Housed Elsewhere Daily Pop.*': 'housed_elsewhere_adp',
        r'.*In-House Work Release.*': 'work_release_adp',
        r'Admissions': 'admissions',
        r'Discharge': 'discharge'
    }

    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    keep_default_na = False
    df = pd.read_excel(filename,
                       sheet_name=0,
                       header=1,
                       keep_default_na=keep_default_na)

    # Drop "F/T" and "P/T" line
    df = df[1:]

    # Drop Totals footer
    df = df[:-9]

    df.columns = df.columns.map(lambda name: name.rstrip(' '))
    df = aggregate_ingest_utils.rename_columns_and_select(df,
                                                          column_names,
                                                          use_regex=True)

    # Some cells have extra '*'
    df = df.applymap(lambda e: str(e).rstrip(' *'))

    df = df.apply(_to_numeric)

    df['report_date'] = _report_date_tab_1(filename)
    df = fips.add_column_to_df(df, df['facility_name'], us.states.PA)
    df['aggregation_window'] = enum_strings.yearly_granularity
    df['report_frequency'] = enum_strings.yearly_granularity

    return df.reset_index(drop=True)
Example #14
def _parse_facility_table(filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF."""
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        "Detention Facility Name",
        "Average Daily Population",
        "Number Felony Pretrial",
        "Number Misdemeanor Pretrial",
        "Total Percent Pretrial",
    ]
    [result] = tabula.read_pdf(
        filename,
        pages=[5, 6],
        multiple_tables=False,
        pandas_options={
            "usecols": range(1, 6),
            "names": column_names,
            "skiprows": [0],
            "skipfooter": 2,
            "engine": "python",
        },
    )

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Detention Facility Name": "facility_name",
            "Average Daily Population": "average_daily_population",
            "Number Felony Pretrial": "number_felony_pretrial",
            "Number Misdemeanor Pretrial": "number_misdemeanor_pretrial",
        },
    )
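    # Drop leftover header text that tabula re-parses at the top of the second
    # page, then any rows left completely empty.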
    result = result.replace("Detention\rFacility\rName", None).dropna(how="all")

    result["average_daily_population"] = (
        result["average_daily_population"].apply(_use_stale_adp).apply(_to_int)
    )
    for column_name in {"number_felony_pretrial", "number_misdemeanor_pretrial"}:
        result[column_name] = result[column_name].apply(_to_int)

    return result
Example #15
def _parse_county_table(_: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF."""
    part1 = tabula.read_pdf(
        filename,
        pages=[3],
        pandas_options={
            "header": [0, 1],
        },
    )
    part2 = tabula.read_pdf(
        filename,
        pages=[4],
        pandas_options={
            "header": [0, 1],
            "skipfooter": 1,  # The last row is the total
            "engine": "python",  # Only python engine supports 'skipfooter'
        },
    )
    result = pd.concat([part1, part2], ignore_index=True)

    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Florida County": "county_name",
            "County Population": "county_population",
            "Average Daily Population (ADP)": "average_daily_population",
            "*Date Reported": "date_reported",
        },
    )

    for column_name in {"county_population", "average_daily_population"}:
        result[column_name] = result[column_name].apply(locale.atoi)

    # Sometimes extra notes are indicated in the date reported field.
    result["date_reported"] = result["date_reported"].str.replace(r"^\*\*$", "")

    result["date_reported"] = pd.to_datetime(result["date_reported"])

    return result
Example #16
def _parse_table(_, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF."""
    whole_df = tabula.read_pdf(
        filename,
        pages='all',
        lattice=True
    )

    # Hand-correct rows that tabula mis-aligns in two specific report files by
    # shifting them back into place and restoring the dropped 'County' values.
    if filename.endswith('04-16-20.pdf'):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis='columns')
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis='columns')
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis='columns')
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis='columns')
        whole_df.loc[451, 'County'] = 86
        whole_df.loc[456, 'County'] = 264
        whole_df.loc[461, 'County'] = 52
        whole_df.loc[464, 'County'] = 161
        whole_df.loc[469, 'County'] = 70
        whole_df.loc[472, 'County'] = 204
        whole_df.loc[477, 'County'] = 182
        whole_df.loc[482, 'County'] = 137
        whole_df.loc[487, 'County'] = 45
        whole_df.loc[492, 'County'] = 410
        whole_df.loc[497, 'County'] = 152
        whole_df.loc[500, 'County'] = 95
        whole_df.loc[505, 'County'] = 85
        whole_df.loc[508, 'County'] = 194
        whole_df.loc[513, 'County'] = 72
        whole_df.loc[516, 'County'] = 134
        whole_df.loc[521, 'County'] = 50
        whole_df.loc[524, 'County'] = 63
        whole_df.loc[529, 'County'] = 32

    # Remove the totals section separately since it has a variable length
    totals_start_index = np.where(whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shifts them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    whole_df = whole_df[whole_df['County'].astype(str) != 'County']

    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time :(
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map[c] if c in column_name_map else c
                        for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if '12/' in df['Federal Inmates'].values:
            df['Federal Inmates'] = df['Federal Inmates'].replace({'12/': '12'})

        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        'total_jail_beds': 'total_jail_beds',
        'reported_population': 'reported_population',
    })

    male_df = aggregate_ingest_utils.rename_columns_and_select(male_df, {
        'County': 'facility_name',
        # Since we've grouped by Male, this Reported Population is only Male
        'Reported Population (Total and Male/Female)': 'male_population',
        'Class D Inmates': 'class_d_male_population',
        'Community Custody Inmates': 'community_custody_male_population',
        'Alternative Sentence': 'alternative_sentence_male_population',
        'Controlled Intake': 'controlled_intake_male_population',
        'Parole Violators': 'parole_violators_male_population',
        'Federal Inmates': 'federal_male_population',
    })

    female_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        # Since we've grouped by Female, this Reported Population is only Female
        'Reported Population (Total and Male/Female)': 'female_population',
        'Class D Inmates': 'class_d_female_population',
        'Community Custody Inmates': 'community_custody_female_population',
        'Alternative Sentence': 'alternative_sentence_female_population',
        'Controlled Intake': 'controlled_intake_female_population',
        'Parole Violators': 'parole_violators_female_population',
        'Federal Inmates': 'federal_female_population',
    })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    if filename.endswith('04-16-20.pdf'):
        result.loc[result['facility_name'] == 'Lincoln', 'total_jail_beds'] = 72

    return result.reset_index(drop=True)
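_split_df and _collapse_by_gender_rows are project helpers not included on this page. A minimal sketch of what _split_df appears to do, cutting the frame into one block per county at the given start positions; this is an assumption about its behavior, not the project's code:

from typing import List

import numpy as np
import pandas as pd


def _split_df(df: pd.DataFrame, start_indices: np.ndarray) -> List[pd.DataFrame]:
    """Split |df| into consecutive blocks, each beginning at a start index."""
    boundaries = list(start_indices) + [len(df)]
    return [
        df.iloc[start:end].reset_index(drop=True)
        for start, end in zip(boundaries[:-1], boundaries[1:])
    ]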
Example #17
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF."""
    whole_df = one(
        tabula.read_pdf(filename,
                        pages="all",
                        multiple_tables=False,
                        lattice=True))

    if filename.endswith("04-16-20.pdf"):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis="columns")
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis="columns")
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis="columns")
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis="columns")
        whole_df.loc[451, "County"] = 86
        whole_df.loc[456, "County"] = 264
        whole_df.loc[461, "County"] = 52
        whole_df.loc[464, "County"] = 161
        whole_df.loc[469, "County"] = 70
        whole_df.loc[472, "County"] = 204
        whole_df.loc[477, "County"] = 182
        whole_df.loc[482, "County"] = 137
        whole_df.loc[487, "County"] = 45
        whole_df.loc[492, "County"] = 410
        whole_df.loc[497, "County"] = 152
        whole_df.loc[500, "County"] = 95
        whole_df.loc[505, "County"] = 85
        whole_df.loc[508, "County"] = 194
        whole_df.loc[513, "County"] = 72
        whole_df.loc[516, "County"] = 134
        whole_df.loc[521, "County"] = 50
        whole_df.loc[524, "County"] = 63
        whole_df.loc[529, "County"] = 32

    # Remove the totals section separately since it has a variable length
    totals_start_index = np.where(
        whole_df["Date"].str.contains("Totals"))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shifts them 1 too far right
    shifted_rows = whole_df["County"].astype(str).str.contains("Secure")
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis="columns")

    whole_df = whole_df[whole_df["County"].astype(str) != "County"]

    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace("\n", " ")
    whole_df.columns = whole_df.columns.str.replace("\r", " ")

    # Column names can change over time :(
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df["Total Jail Beds"].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if "12/" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace(
                {"12/": "12"})
        if "yo" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"yo": "0"})
        if "pe" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"pe": "0"})
        if "(" in df["Reported Population (Total and Male/Female)"].values:
            df["Reported Population (Total and Male/Female)"] = df[
                "Reported Population (Total and Male/Female)"].replace(
                    {"(": "0"})

        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df,
            ignore_columns={"County", "Facility Security", "Inmate Cusody"})

        df["Gender"] = None
        df = _collapse_by_gender_rows(df, "Male")
        df = _collapse_by_gender_rows(df, "Female")

        # The first row contains header data for both Male and Female
        df["County"] = df["County"][0]
        df["total_jail_beds"] = df["Total Jail Beds"][0]
        df["reported_population"] = df[
            "Reported Population (Total and Male/Female)"][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender["Gender"] == "Male"]
    female_df = df_by_gender[df_by_gender["Gender"] == "Female"]

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only Female
            "Reported Population (Total and Male/Female)": "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates": "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )

    result = shared_df.join(male_df.set_index("facility_name"),
                            on="facility_name")
    result = result.join(female_df.set_index("facility_name"),
                         on="facility_name")

    if filename.endswith("04-16-20.pdf"):
        result.loc[result["facility_name"] == "Lincoln",
                   "total_jail_beds"] = 72

    return result.reset_index(drop=True)
Example #18
def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF."""

    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(filename,
                                    pages=pages,
                                    lattice=use_lattice,
                                    multiple_tables=True)
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    elif report_date >= datetime.date(2020, 11, 5):
        # Skip every 48th row for new-style reports
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": [x * 48 for x in range(4)],
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine":
                    "python",  # Only python engine supports 'skipfooter'
                },
            ))
    else:
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": _header_on_each_page(),
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine":
                    "python",  # Only python engine supports 'skipfooter'
                },
            ))

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail":
            "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
            "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
            "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
            "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )

    # Tabula may parse extra empty rows
    result = result.dropna()

    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={"county_name"})

    return result
Example #19
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF."""
    whole_df = read_pdf(location, filename, pages='all', lattice=True)

    # Remove the totals section separately since it has a variable length
    totals_start_index = np.where(
        whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shifts them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    whole_df = whole_df[whole_df['County'].astype(str) != 'County']

    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time :(
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df,
            ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only Female
            'Reported Population (Total and Male/Female)': 'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates': 'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    return result.reset_index(drop=True)
Example #20
def _parse_table(location: str, filename: str,
                 report_date: datetime.date) -> pd.DataFrame:
    """Parses the TX County Table in the PDF."""
    num_pages = 9
    columns_to_schema = _get_column_names(report_date)

    pages = []
    for page_num in range(1, num_pages + 1):
        # Each page has 1 or more tables on it, with the last table being the
        # one that holds the data.  Tabula reads the headers poorly, and
        # different years respond differently to this call, so we fetch all of
        # the tables and keep only the one that contains numbers.  That lets
        # us clean it up by dropping nonsense columns and rows and then
        # assigning our own column names.
        df = read_pdf(
            location,
            filename,
            multiple_tables=True,
            pages=page_num,
        )
        df = df[-1]
        df = df.dropna(axis='columns', thresh=5)
        # We want to remove all of the rows and columns that have no data.
        numeric_elements = df.apply(pd.to_numeric, errors='coerce').notnull()
        rows_containing_data = numeric_elements.any(axis='columns')
        df = df.loc[rows_containing_data]
        # Next we finally break up some of the columns that were incorrectly
        # concatenated.
        for column in df.columns[1:]:
            # By this point the rows should contain only numeric data, so a
            # space in a cell means two columns were concatenated and must be
            # split.  Checking a single row is enough because a concatenated
            # column is concatenated in every row.
            if ' ' in df[column].iloc[0]:
                index_to_insert = df.columns.get_loc(column)
                df_temp = pd.DataFrame(
                    df.pop(column).str.split(n=1, expand=True))
                df.insert(index_to_insert, str(column) + '_a', df_temp[0])
                df.insert(index_to_insert + 1, str(column) + '_b', df_temp[1])
        pages.append(df)

    # Drop the last row since it's the 'Totals' section
    pages[-1] = pages[-1].drop(pages[-1].tail(1).index)

    # Build result for all the pages.  We rename the columns before calling
    # concat because the column names might all be different.  Renaming them
    # allows concat to pass happily.
    columns_to_drop = ['percent_capacity', 'total_local']
    for i, page in enumerate(pages):
        page.columns = columns_to_schema.keys()
        page = aggregate_ingest_utils.rename_columns_and_select(
            page, columns_to_schema)
        # We don't care about % of capacity and total_local so we drop these
        # columns.
        page = page.drop(columns_to_drop, axis='columns')
        pages[i] = page

    result = pd.concat(pages, ignore_index=True)

    for column_name in set(result.columns) - {'facility_name'}:
        result[column_name] = result[column_name].astype(int)

    return result
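The splitting loop above relies on pandas' Series.str.split with n=1 and expand=True, which breaks a merged cell such as '12 345' into two columns at the first space. A tiny standalone illustration of that call:

import pandas as pd

merged = pd.Series(["12 345", "6 78"])
# n=1 splits only at the first space; expand=True returns a two-column frame.
split = merged.str.split(n=1, expand=True)
assert list(split[0]) == ["12", "6"]
assert list(split[1]) == ["345", "78"]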