Example #1
def process_2021_law_website_data() -> pd.DataFrame:
    """Loads the raw 2021 settlement data from the law department csv,
    converts it to a properly formatted dataframe, saves to csv and returns it
    """
    # load the excel file, skipping the first 4 rows and the last 7;
    # header=1 then makes the second remaining row the column headers
    raw_2021_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=7,
    )
    # drop columns that were read in entirely empty
    raw_2021_df.dropna(axis=1, inplace=True, how="all")
    # verify the expected shape: 473 rows and 8 columns
    assert raw_2021_df.shape == (473, 8)

    for col in ["DATE TO COMPTROLLER"]:
        raw_2021_df[col] = pd.to_datetime(raw_2021_df[col])

    # fix any whitespace issues
    raw_2021_df = util.strip_and_trim_whitespace(raw_2021_df)

    util.save_df(
        df=raw_2021_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2021_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2021_df
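
util.strip_and_trim_whitespace appears in nearly every function here but is not shown. A minimal sketch of the assumed behavior (strip leading/trailing whitespace and collapse internal runs in string cells and column names), hypothetical rather than the project's actual helper:

import pandas as pd


def strip_and_trim_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical sketch of the util helper: normalize whitespace in
    string columns and column names; the real implementation may differ.
    """
    df = df.copy()
    # " ".join(s.split()) both strips the ends and collapses internal runs
    df.columns = [" ".join(str(col).split()) for col in df.columns]
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].map(
            lambda v: " ".join(v.split()) if isinstance(v, str) else v)
    return df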
Example #2
def process_2013_law_website_data() -> pd.DataFrame:
    """Loads the raw 2013 settlement data from the law department csv,
    converts it to a properly formatted dataframe, saves to csv and returns it
    """
    # load the excel file, skipping the first 4 rows and the last 4;
    # header=1 then makes the second remaining row the column headers
    raw_2013_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=4,
    )
    # verify the expected shape: 1068 rows and 9 columns
    assert raw_2013_df.shape == (1068, 9)

    # give the unnamed trailing column a descriptive name
    raw_2013_df.rename(columns={"Unnamed: 8": "Hidden Column"}, inplace=True)

    # fix any whitespace issues
    raw_2013_df = util.strip_and_trim_whitespace(raw_2013_df)

    util.save_df(
        df=raw_2013_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2013_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2013_df
Example #3
def process_2014_law_website_data() -> pd.DataFrame:
    """Loads the raw 2014 settlement data from the law department csv,
    converts it to a properly formatted dataframe, saves to csv and returns it
    """
    # load the excel file, skipping the first 3 rows and the last 654;
    # header=1 then makes the second remaining row the column headers
    raw_2014_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=3,
        skipfooter=654,
    )
    # verify the expected shape: 1172 rows and 13 columns
    assert raw_2014_df.shape == (1172, 13)

    # drop the hidden comptroller column
    assert raw_2014_df["COMPTROLLER"].isna().all()
    raw_2014_df.drop(columns=["COMPTROLLER"], inplace=True)

    for col in ["EFFECTIVE DATE\n", "DATE TO\nCOMPTROLLER", "DUE DATE"]:
        raw_2014_df[col] = pd.to_datetime(raw_2014_df[col])

    # fix any whitespace issues
    raw_2014_df = util.strip_and_trim_whitespace(raw_2014_df)

    util.save_df(
        df=raw_2014_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2014_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2014_df
Example #4
def save_csv_formatted_foia_cpd_payments_data() -> pd.DataFrame:
    """Loads the raw unmodified 2004 to 2018 cpd payment data,
    changes it into a workable dataframe format, saves that as a csv
    then returns the dataframe as well
    """
    # skip the first 4 rows and the last 3;
    # header=1 makes the second remaining row the column headers
    raw_foia_cpd_payments_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=3,
    )

    # fix any whitespace issues
    raw_foia_cpd_payments_df = util.strip_and_trim_whitespace(
        raw_foia_cpd_payments_df,
    )

    # now convert to proper dtypes
    raw_foia_cpd_payments_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_foia_cpd_payments_df["DATE TO COMPTROLLER"]
    )

    # now save to csv
    util.save_df(
        df=raw_foia_cpd_payments_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_cpd_payments_df
Example #5
def save_csv_formatted_quarterly_police_suit_disp_data() -> pd.DataFrame:
    """Loads the raw unmodified quarterly police lawsuits disposition data
    changes it into a workable dataframe format, saves that as a csv
    then returns the dataframe as well
    """
    # load the sheet which currently has multiple subtables in it
    unsplit_raw_foia_police_suits_disp_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_EXCEL_SHEET,
    )
    # properly format the subtables into one df
    raw_foia_police_suits_disp_df = format_multitable_df(
        unsplit_raw_foia_police_suits_disp_df,
        header_row_value="Docket Number",
        subheading_col_name="Client Department",
    )
    # strip the client department of the leading "Client Department:" value
    raw_foia_police_suits_disp_df["Client Department"] = (
        raw_foia_police_suits_disp_df["Client Department"]
        .str.lstrip("Client Department:")
        .str.strip()
    )

    # now save to csv
    util.save_df(
        df=raw_foia_police_suits_disp_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_police_suits_disp_df
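
format_multitable_df is defined elsewhere in the module. Judging only from how it is called, it flattens a sheet of stacked subtables, each introduced by a one-cell subheading row followed by a repeated header row whose first cell equals header_row_value. A hypothetical sketch under those assumptions:

import pandas as pd


def format_multitable_df(
    unsplit_df: pd.DataFrame,
    header_row_value: str,
    subheading_col_name: str,
) -> pd.DataFrame:
    """Hypothetical sketch, not the project's actual implementation."""
    first_col = unsplit_df.columns[0]
    is_header = unsplit_df[first_col].astype(str).eq(header_row_value)
    # a subheading row sits directly above each repeated header row;
    # forward-fill it so every data row is tagged with its subtable
    is_subheading = is_header.shift(-1, fill_value=False)
    subheadings = unsplit_df[first_col].where(is_subheading).ffill()

    df = unsplit_df.copy()
    # use the first repeated header row for the column names
    df.columns = unsplit_df.loc[is_header].iloc[0].tolist()
    df[subheading_col_name] = subheadings
    # drop the subheading and header rows themselves
    df = df.loc[~(is_header | is_subheading)]
    return df.reset_index(drop=True)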
Example #6
def save_csv_formatted_foia_pending_suits_data() -> pd.DataFrame:
    """Loads the raw unmodified pending police lawsuits data,
    changes it into a workable dataframe format, saves that as a csv
    then returns the dataframe as well
    """
    # use the second row as the column headers and skip the last footer row
    raw_foia_pending_police_suits_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skipfooter=1,
    )

    # fix any whitespace issues
    raw_foia_pending_police_suits_df = util.strip_and_trim_whitespace(
        raw_foia_pending_police_suits_df,
    )

    # now save to csv
    util.save_df(
        df=raw_foia_pending_police_suits_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_PENDING_POLICE_SUITS_FOTA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_pending_police_suits_df
Example #7
def save_csv_formatted_matter_disp_report_data() -> typing.List[pd.DataFrame]:
    """Loads the raw unmodified matter disposition data
    changes it into a workable dataframe formats,
    saves each sheet as a csv then returns a list of the dataframes as well
    """
    output_list = []

    # sheet name, subheading col name and the output csv name
    sheet_save_list = [
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_DIVISION_FOIA_DATA_EXCEL_SHEET,
            "Division",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_DIVISION_FOIA_DATA_CSV,
        ),
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_DEPARTMENT_FOIA_DATA_EXCEL_SHEET,
            "Client Department",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_DEPARTMENT_FOIA_DATA_CSV,
        ),
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_ASSIGNEE_FOIA_DATA_EXCEL_SHEET,
            "Main Assignee",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_ASSIGNEE_FOIA_DATA_CSV,
        ),
    ]

    for sheet_name, subtable_col_name, output_csv_name in sheet_save_list:

        unsplit_df = pd.read_excel(
            io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
                RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_FOIA_DATA_EXCEL_FILE
            ),
            sheet_name=sheet_name,
        )
        # properly format the subtables into one df
        formatted_df = format_multitable_df(
            unsplit_df,
            header_row_value="Docket Number",
            subheading_col_name=subtable_col_name,
        )

        # strip the client department of the leading "Division:" value
        formatted_df[subtable_col_name] = (
            formatted_df[subtable_col_name]
            .str.lstrip(subtable_col_name + ":")
            .str.strip()
        )
        output_list.append(formatted_df)

        # now save to csv
        util.save_df(
            df=formatted_df,
            file_name=output_csv_name,
            save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
        )

    return output_list
Example #8
def save_csv_formatted_foia_tort_payments_data() -> pd.DataFrame:
    """Loads the raw unmodified 2001 to 2007 tort payment data,
    changes it into a workable dataframe format, saves that as a csv
    then returns the dataframe as well
    """
    # skip the first 4 rows;
    # header=1 makes the second remaining row the column headers
    raw_foia_tort_payments_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
    )

    # fix any whitespace issues
    raw_foia_tort_payments_df = util.strip_and_trim_whitespace(
        raw_foia_tort_payments_df,
    )

    # do some value-specific replacements ("NONE" means a zero payment)
    payment_replace_dict = {"NONE": 0}

    # Series.replace with inplace=True on a column selection can act on a
    # copy; assign the result back instead
    raw_foia_tort_payments_df["PAYMENT AMOUNT ($)"] = raw_foia_tort_payments_df[
        "PAYMENT AMOUNT ($)"
    ].replace(to_replace=payment_replace_dict)
    raw_foia_tort_payments_df["FEES & COSTS ($)"] = raw_foia_tort_payments_df[
        "FEES & COSTS ($)"
    ].replace(to_replace=payment_replace_dict)
    # now convert to proper dtypes
    raw_foia_tort_payments_df["PAYMENT AMOUNT ($)"] = pd.to_numeric(
        raw_foia_tort_payments_df["PAYMENT AMOUNT ($)"]
    )
    raw_foia_tort_payments_df["FEES & COSTS ($)"] = pd.to_numeric(
        raw_foia_tort_payments_df["FEES & COSTS ($)"]
    )
    raw_foia_tort_payments_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_foia_tort_payments_df["DATE TO COMPTROLLER"]
    )

    # now save to csv
    util.save_df(
        df=raw_foia_tort_payments_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_tort_payments_df
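
If, unlike here, every unparseable token should become missing data rather than zero, pd.to_numeric can coerce instead; a hedged alternative to the explicit replacement above:

# errors="coerce" turns anything unparseable (e.g. "NONE") into NaN
# instead of raising; the pipeline above instead maps "NONE" to 0 first,
# treating it as a zero-dollar payment
payments = pd.to_numeric(
    raw_foia_tort_payments_df["PAYMENT AMOUNT ($)"], errors="coerce")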
Example #9
def process_2012_law_website_data() -> pd.DataFrame:
    """Loads the raw 2012 settlement data from the law department csv,
    converts it to a properly formatted dataframe, saves to csv and returns it
    """
    # load the excel file, skipping the first 4 rows and the last 6;
    # header=1 then makes the second remaining row the column headers
    raw_2012_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=6,
    )
    # verify the expected shape: 919 rows and 8 columns
    assert raw_2012_df.shape == (919, 8)

    # split into tort and non-tort sections using the marker rows
    assert raw_2012_df.loc[0, "CASE #"] == "TORT"
    assert raw_2012_df.loc[909, "CASE #"] == "NON-TORT"

    raw_2012_df.loc[0:909, "Tort Status"] = "TORT"
    raw_2012_df.loc[909:, "Tort Status"] = "NON-TORT"

    raw_2012_df.drop(index=[0, 909], inplace=True)

    # fix any whitespace issues
    raw_2012_df = util.strip_and_trim_whitespace(raw_2012_df)

    util.save_df(
        df=raw_2012_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2012_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2012_df
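
The marker-row pattern above (a label row such as "TORT" splits the sheet into sections) recurs in the 2008 and 2009 PDF processing below. A hypothetical generic helper, assuming the markers sit in the case-number column:

import pandas as pd


def label_tort_sections(df: pd.DataFrame, case_col: str = "CASE #") -> pd.DataFrame:
    """Hypothetical helper: forward-fill section labels from marker rows,
    then drop the marker rows themselves."""
    is_marker = df[case_col].isin(["TORT", "NON-TORT"])
    df = df.copy()
    # marker values propagate down to every row until the next marker
    df["Tort Status"] = df[case_col].where(is_marker).ffill()
    return df.loc[~is_marker]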
Example #10
# null out implausibly large interval counts
def cap(val, amount):
    if val >= amount:
        return np.nan
    return val

df['ENTRIES_PER_INTERVAL'] = df['ENTRIES_PER_INTERVAL'].apply(cap, amount=1000000)
df['EXITS_PER_INTERVAL'] = df['EXITS_PER_INTERVAL'].apply(cap, amount=1000000)
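
The apply-based capping works value by value in Python; a vectorized equivalent with Series.where (values failing the condition become NaN) gives the same result with less overhead:

# hypothetical vectorized alternative to the cap() helper above
threshold = 1000000
df['ENTRIES_PER_INTERVAL'] = df['ENTRIES_PER_INTERVAL'].where(
    df['ENTRIES_PER_INTERVAL'] < threshold)
df['EXITS_PER_INTERVAL'] = df['EXITS_PER_INTERVAL'].where(
    df['EXITS_PER_INTERVAL'] < threshold)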


# add column for busyness which is entries + exits
# this is now very easy given we have <ENTRIES/EXITS>_PER_INTERVAL
df['BUSYNESS_PER_INTERVAL'] = df.ENTRIES_PER_INTERVAL + df.EXITS_PER_INTERVAL

# add any date information that might be important for modeling;
# do this now so we don't have to redo it every time we tweak the model
df['MINUTE'] = df['DATETIME'].dt.minute
df['HOUR'] = df['DATETIME'].dt.hour
df['MONTH'] = df['DATETIME'].dt.month
df['YEAR'] = df['DATETIME'].dt.year
df['DAYOFWEEK'] = df['DATETIME'].dt.dayofweek
df['DAYOFYEAR'] = df['DATETIME'].dt.dayofyear
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week replaces it
df['WEEKOFYEAR'] = df['DATETIME'].dt.isocalendar().week

# TODO add month busyness?

# TODO descriptive statistics???!
# http://pandas.pydata.org/pandas-docs/stable/api.html#computations-descriptive-stats
# remember, these have to be available at prediction time

# save the entire dataframe, both as a plain csv and via the util helper
df.to_csv('augmented.csv')
util.save_df(df, 'turnstile', 'augmented')
Example #11
def process_2008_law_website_data() -> pd.DataFrame:
    """Loads the raw 2008 settlement data from the law department website,
    converts it from pdf to a pandas dataframe, then saves it as a csv
    and returns the dataframe from the function
    """
    # get the path to the pdf
    raw_2008_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2008_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the tables on the first 55 pages to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2008_pdf_path), pages="1-55")
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")
    # expected headers for all pages besides the first
    non_first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE "
        "\nDEPARTMENT \nTO \nAMOUNT \nCOSTS \nINVOLVED \nDISPOSITION "
        "\nCOMPTROLLER \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE")
    # define the expected columns and dtypes
    raw_2008_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        "PAYMENT AMOUNT($)": int,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        # unit-less np.datetime64 is rejected by newer pandas in astype
        "DATE TO COMPTROLLER": "datetime64[ns]",
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2008_df_cols = raw_2008_df_col_types.keys()

    # special replacements for the payment amount column
    payment_amount_replacements = {
        "A \n5,694": "5694",
        "S \n76,000": "76000",
        "(181)": "-181",
        "(2,374)": "-2374",
    }
    # collect each page's table here and concatenate after the loop
    # (DataFrame.append was removed in pandas 2.0, so avoid it)
    page_dfs = []
    last_page = 55

    # append all the pages' tables together
    for index, table in enumerate(tables):
        page_num = index + 1
        # extract the table as a dataframe
        table_df = table.df

        # check every cell besides first one on first row is empty string
        assert table_df.iloc[0].iloc[1:].eq("").all()
        if page_num == 1:
            assert table_df.iloc[0][0] == first_page_header_string
        else:
            assert table_df.iloc[0][0] == non_first_page_header_string
        # now drop that first row
        table_df = table_df.drop(index=[0])

        # the first page's table has a special shape
        if page_num == 1:
            assert table_df.shape == (44, 8)
            table_df["Tort Status"] = "TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # make everything after 37 non tort and everything before tort
            assert table_df[0].loc[37] == "NON-TORT"
            assert table_df.loc[47][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n129,670,864 \nTOTAL FEES AND COSTS \n6,903,180")
            table_df.loc[:37, "Tort Status"] = "TORT"
            table_df.loc[37:, "Tort Status"] = "NON-TORT"
            # drop the non-tort label row and the final totals row
            table_df = table_df.drop(index=[37, 47])
        else:
            # some pages' tables have one fewer row (49 instead of 50
            # after dropping the header row)
            if page_num in [12, 41, 47, 49, 51, 53]:
                assert table_df.shape == (49, 8)
            else:
                assert table_df.shape == (50, 8)
            table_df["Tort Status"] = "TORT"

        table_df["pdf_page_num"] = page_num

        # rename the columns
        table_df.columns = raw_2008_df_cols
        # fix the issue with the fees and primary cause column getting jumbled
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))

        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements,
            inplace=True,
        )
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(int))

        # collect this page's table
        page_dfs.append(table_df)

    # combine all the pages into one dataframe
    raw_2008_df = pd.concat(page_dfs, ignore_index=True)
    # convert to datetime
    raw_2008_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2008_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2008_df = raw_2008_df.astype(raw_2008_df_col_types)
    # do whitespace fixing
    raw_2008_df = util.strip_and_trim_whitespace(raw_2008_df)

    # save to csv
    util.save_df(
        df=raw_2008_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2008_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2008_df
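
FEE_AND_PRIM_CASE_PAT is defined elsewhere and not shown; what follows is a hypothetical stand-in inferred only from how it is used. str.extract expects two capture groups, the fee figure first and the primary cause second:

import re

# hypothetical stand-in for FEE_AND_PRIM_CASE_PAT (assumed, not the
# project's actual pattern): group 1 captures the comma-grouped fee
# figure at the start of the concatenated string, group 2 the rest of
# the text as the primary cause; DOTALL lets it span newlines
FEE_AND_PRIM_CASE_PAT = re.compile(r"^\s*([\d,]+)\s*(.*)$", re.DOTALL)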
Example #12
def process_2009_law_website_data() -> pd.DataFrame:
    """Loads the raw 2009 settlement data from the law department website,
    converts it from pdf to a pandas dataframe, then saves it as a csv
    and returns the dataframe
    """
    # get the path to the pdf
    raw_2009_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2009_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the tables on the first 21 pages to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2009_pdf_path), pages="1-21")
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")

    # define the expected columns and dtypes
    raw_2009_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        # some 2009 payments include cents, so float rather than int
        "PAYMENT AMOUNT($)": float,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        # unit-less np.datetime64 is rejected by newer pandas in astype
        "DATE TO COMPTROLLER": "datetime64[ns]",
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2009_df_cols = raw_2009_df_col_types.keys()

    # special replacements for the payment amount column
    payment_amount_replacements = {
        "(2,144)": "-2144",
        "(14,405)": "-14405",
        "(1,340)": "-1340",
        "(1,000)": "-1000",
        "(1,352)": "-1352",
        "(1,353)": "-1353",
        "(500)": "-500",
        "(1,175)": "-1175",
        "($550.23)": "550.23",
        "$620.00": "620",
        "$223.83": "223.83",
        "$450.00": "450",
        "$2,499.15": "2,499.15",
    }

    last_page = 21
    # collect each page's table here and concatenate after the loop
    # (DataFrame.append was removed in pandas 2.0, so avoid it)
    page_dfs = []

    # append all the pages' tables together
    for index, table in enumerate(tables):
        page_num = index + 1
        # extract the table as a dataframe
        table_df = table.df.copy()

        # the first page's table has a special shape: its first row holds
        # all the header values in one cell
        if page_num == 1:
            # check every cell besides first one on first row is empty string
            assert table_df.iloc[0].iloc[1:].eq("").all()
            assert table_df.iloc[0][0] == first_page_header_string
            # now drop that first row
            table_df = table_df.drop(index=[0])
            assert table_df.shape == (46, 8)
            table_df["Tort Status"] = "TORT"
        # special rule for page 20 where there is a split of tort and non-tort
        elif page_num == 20:
            assert table_df.loc[25].iloc[0] == "NON-TORT"
            table_df = table_df.drop(index=[25])
            assert table_df.shape == (54, 8)
            table_df.loc[:25, "Tort Status"] = "TORT"
            table_df.loc[25:, "Tort Status"] = "NON-TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # check the last row is the sums
            assert table_df.loc[46][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n51,155,053 \nTOTAL FEES AND COSTS \n7,660,924 "
                "\nTOTAL JUDGMENT/VERDICTS, SETTLEMENTS, FEES AND COSTS \n58,815,977"
            )
            table_df["Tort Status"] = "NON-TORT"
            # drop the final totals row
            table_df = table_df.drop(index=[46])
        else:
            assert table_df.shape == (55, 8)
            table_df["Tort Status"] = "TORT"
        # add a page number
        table_df["pdf_page_num"] = page_num

        # rename the columns
        table_df.columns = raw_2009_df_cols

        # fix a specific value whose number was cut off during extraction
        if page_num == 9:
            table_df.loc[13, "PAYMENT AMOUNT($)"] = "1395000"

        # fix the issue with the fees and primary cause column getting jumbled
        assert table_df["FEES & COSTS($)"].notna().all()
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].astype(str).str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))

        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements,
            inplace=True,
        )
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(float))

        # collect this page's table
        page_dfs.append(table_df)

    # combine all the pages into one dataframe
    raw_2009_df = pd.concat(page_dfs, ignore_index=True)
    # convert to datetime
    raw_2009_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2009_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2009_df = raw_2009_df.astype(raw_2009_df_col_types)
    # do whitespace fixing
    raw_2009_df = util.strip_and_trim_whitespace(raw_2009_df)

    util.save_df(
        df=raw_2009_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2009_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2009_df
Example #13
def reformat_and_save(f):
    # report which layout the file uses, then load and re-save it
    print('old' if is_old_format(f) else 'new', f)
    df = read_old(f) if is_old_format(f) else read_new(f)
    util.save_df(df, f, 'reformatted')
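
is_old_format, read_old, and read_new are defined elsewhere. If these are the MTA turnstile files from the earlier snippet, a hypothetical check could sniff the first line for the new layout's header row (assumptions: the new files start with a known header, the old ones start straight into data):

def is_old_format(f):
    """Hypothetical sketch of the format check; the header prefix below
    is an assumption about the new-format files."""
    with open(f) as fh:
        return not fh.readline().startswith('C/A,UNIT,SCP,STATION')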
Example #14
def clean_and_standardize_all_data() -> None:
    """Cleans, standardizes, and saves the Law Website data for each year,
    then saves a single file with all the years combined into one."""
    # list of tuples with (raw_csv, output_csv)
    processing_list = [
        (
            RAW_C.RAW_CSV_FORMATTED_2008_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2008_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2009_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2009_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2010_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2010_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2011_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2011_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2012_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2012_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2013_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2013_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2014_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2014_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2015_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2015_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2016_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2016_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2017_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2017_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2018_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2018_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2019_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2019_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2020_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2020_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2021_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2021_LAW_WEBSITE_DATA_CSV,
        ),
    ]

    rename_dict = STAN_C.LAW_WEBSITE_DATA_COL_STANDARDIZATION_RENAME_DICT
    output_dfs = []

    doc_yr = 2008
    for raw_csv, output_csv in processing_list:
        raw_df = util.load_df(
            file_name=raw_csv,
            save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
        )
        assert raw_df.columns.isin(
            rename_dict.keys()
        ).all(), f"Not all columns in {raw_csv} are in the rename dict!"
        # rename
        standardized_df = raw_df.rename(columns=rename_dict)
        # standardize case number and extract relevant info
        standardized_df = case_num_parsing.standardize_case_num_info(standardized_df)
        # remove the trailing " 0931" code from department names (included
        # one year for some reason); removesuffix strips the exact suffix,
        # unlike rstrip, which would eat any trailing run of those characters
        standardized_df[STAN_C.CITY_DEPARTMENT_INVOLVED_COL] = standardized_df[
            STAN_C.CITY_DEPARTMENT_INVOLVED_COL
        ].str.removesuffix(" 0931")
        # fix any money columns that were not read in as numeric
        already_numeric_cols = standardized_df.select_dtypes("number").columns
        # remove dollar signs, commas, and whitespace before converting
        for money_col in [STAN_C.PAYMENT_AMOUNT_COL, STAN_C.FEES_AND_COSTS_COL]:
            # skip if already numeric
            if money_col in already_numeric_cols:
                continue

            standardized_df[money_col] = pd.to_numeric(
                standardized_df[money_col].str.replace(
                    pat=r"[\,\$\s]+", repl="", regex=True
                )
            )
        # add data source
        standardized_df[STAN_C.DATA_SOURCE_COL] = f"law_dept_website_{doc_yr}"
        doc_yr += 1
        # save output
        util.save_df(
            df=standardized_df,
            file_name=output_csv,
            save_dir=DIR_C.CLEANED_AND_STANDARDIZED_LAW_WEBSITE_DATA_DIR,
        )
        output_dfs.append(standardized_df)

    all_yrs_output_df = pd.concat(output_dfs)
    util.save_df(
        df=all_yrs_output_df,
        file_name=STAN_C.STANDARDIZED_ALL_YEARS_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.CLEANED_AND_STANDARDIZED_LAW_WEBSITE_DATA_DIR,
    )
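
util.save_df and util.load_df are used throughout but never shown. A minimal sketch, assuming they simply round-trip csvs under a directory (the real helpers may add logging or validation, and the turnstile snippets appear to call save_df with a different signature):

import pathlib

import pandas as pd


def save_df(df: pd.DataFrame, file_name: str, save_dir: pathlib.Path) -> None:
    """Hypothetical sketch: write df to <save_dir>/<file_name> as a csv."""
    save_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(save_dir.joinpath(file_name), index=False)


def load_df(file_name: str, save_dir: pathlib.Path) -> pd.DataFrame:
    """Hypothetical sketch: read the csv back into a dataframe."""
    return pd.read_csv(save_dir.joinpath(file_name))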