Example #1
def create_ballot_measure_contests(df, columns):
    """Join ballot-measure contest and selection tables, keep the requested
    columns, and label each row with contest_type 'BallotMeasure'."""
    ballotmeasure_df = (
        df["ContestSelectionJoin"]
        .merge(
            df["BallotMeasureContest"],
            how="right",
            left_on="Contest_Id",
            right_index=True,
        )
        .rename(columns={"Name": "Contest"})
        .merge(
            df["BallotMeasureSelection"],
            how="left",
            left_on="Selection_Id",
            right_index=True,
        )
    )
    ballotmeasure_df = ballotmeasure_df[columns]
    if ballotmeasure_df.empty:
        ballotmeasure_df["contest_type"] = None
    else:
        ballotmeasure_df = m.add_constant_column(
            ballotmeasure_df, "contest_type", "BallotMeasure"
        )
    return ballotmeasure_df
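The function above assumes `df` is a dictionary of pandas DataFrames keyed by table name, with the looked-up tables indexed by Id, and that `m.add_constant_column` returns a copy of the frame with one added constant-valued column. A minimal sketch of how it might be exercised; the stand-in helper and all table contents here are invented for illustration:

import types

import pandas as pd

def add_constant_column(df, col, value):
    # stand-in for the project helper m.add_constant_column (assumed behavior)
    out = df.copy()
    out[col] = value
    return out

m = types.SimpleNamespace(add_constant_column=add_constant_column)

tables = {
    "ContestSelectionJoin": pd.DataFrame({"Contest_Id": [1], "Selection_Id": [10]}),
    "BallotMeasureContest": pd.DataFrame(
        {"Name": ["Measure A"]}, index=pd.Index([1], name="Id")
    ),
    "BallotMeasureSelection": pd.DataFrame(
        {"Selection": ["Yes"]}, index=pd.Index([10], name="Id")
    ),
}

result = create_ballot_measure_contests(tables, ["Contest", "Selection"])
print(result)  # one row: Measure A / Yes, contest_type='BallotMeasure'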
Example #2
def create_candidate_contests(df, columns):
    """Join vote counts to contest, selection, candidate, and office tables,
    keep the requested columns, and label each row with contest_type 'Candidate'."""
    contest_df = (
        df["VoteCount"]
        .merge(
            df["Contest"],
            how="left",
            left_on="Contest_Id",
            right_index=True,
        )
        .rename(columns={"Name": "Contest", "Id": "ContestSelectionJoin_Id"})
        .merge(
            df["CandidateSelection"],
            how="left",
            left_on="Selection_Id",
            right_index=True,
        )
        .merge(
            df["Candidate"],
            how="left",
            left_on="Candidate_Id",
            right_index=True,
        )
        .rename(columns={"BallotName": "Selection"})
        .merge(
            df["CandidateContest"],
            how="left",
            left_on="Contest_Id",
            right_index=True,
        )
        .merge(
            df["Office"],
            how="left",
            left_on="Office_Id",
            right_index=True,
        )
    )
    contest_df = contest_df[columns]
    if contest_df.empty:
        contest_df["contest_type"] = None
    else:
        contest_df = m.add_constant_column(contest_df, "contest_type", "Candidate")
    return contest_df
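In both functions, note the empty-frame branch: when no rows survive the merges, `contest_type` is assigned directly so the column still exists, presumably so downstream code can rely on a stable schema. A quick self-contained check of that behavior (column names invented):

import pandas as pd

empty = pd.DataFrame(columns=["Contest", "Selection"])
empty["contest_type"] = None   # what the empty branch does
print(empty.columns.tolist())  # ['Contest', 'Selection', 'contest_type']
print(empty.empty)             # True: no rows were added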
Example #3
def read_multi_sheet_excel(
    f_path: str,
    munger: jm.Munger,
    err: dict,
) -> Tuple[pd.DataFrame, dict]:
    """Read every sheet of an Excel file (except those the munger says to skip),
    reshape each to long format, and return the concatenated results plus errors."""
    # get munger parameters
    sheets_to_skip = munger.options["sheets_to_skip"]
    count_of_top_lines_to_skip = munger.options["count_of_top_lines_to_skip"]
    constant_line_count = munger.options["constant_line_count"]
    constant_column_count = munger.options["constant_column_count"]
    header_row_count = munger.options["header_row_count"]
    columns_to_skip = munger.options["columns_to_skip"]

    try:
        df = pd.read_excel(f_path, sheet_name=None, header=None)
    except Exception as e:
        new_err = ui.add_new_error(
            err, "file", Path(f_path).name, f"Error reading file: {e}"
        )
        if new_err:
            err = ui.consolidate_errors([err, new_err])
        # if the file cannot be read there are no sheets to process,
        # so return regardless of whether the error was fatal
        return pd.DataFrame(), err

    sheets_to_read = [k for k in df.keys() if k not in sheets_to_skip]

    raw_results = pd.DataFrame()
    for sh in sheets_to_read:
        try:
            data = df[sh].copy()

            # remove lines designated ignorable
            data.drop(data.index[:count_of_top_lines_to_skip], inplace=True)

            # remove any all-null rows
            data.dropna(how="all", inplace=True)

            # read constant_line info from first non-null entries of constant-header rows
            # then drop those rows
            if constant_line_count > 0:
                constant_lines = (
                    data.iloc[:constant_line_count].bfill(axis=1).iloc[:, 0]
                )
                data.drop(data.index[:constant_line_count], inplace=True)

            # read constant_column info from first non-null entries of constant columns
            # and drop those columns
            if constant_column_count > 0:
                constant_columns = (
                    data.T.iloc[:constant_column_count].bfill(axis=1).iloc[:, 0]
                )
                data.drop(
                    data.columns[:constant_column_count], axis=1, inplace=True
                )

            # add multi-index for actual header rows
            header_variable_names = [
                f"header_{j}" for j in range(header_row_count)
            ]

            col_multi_index = pd.MultiIndex.from_frame(
                data.iloc[range(header_row_count), :].transpose().ffill(),
                names=header_variable_names,
            )
            data.columns = col_multi_index

            # remove header rows from data
            data.drop(data.index[:header_row_count], inplace=True)

            # Drop extraneous columns per munger, and columns without data
            data.drop(data.columns[columns_to_skip], axis=1, inplace=True)
            data.dropna(axis=1, how="all", inplace=True)

            # make first column into an index
            data.set_index(keys=data.columns[0], inplace=True)

            # move header info to columns
            data = pd.melt(
                data,
                ignore_index=False,
                value_name="count",
                var_name=header_variable_names,
            )

            # add column(s) for constant info
            for j in range(constant_line_count):
                data = m.add_constant_column(
                    data, f"constant_line_{j}", constant_lines.iloc[j]
                )
            for j in range(constant_column_count):
                data = m.add_constant_column(
                    data, f"constant_column_{j}", constant_columns.iloc[j]
                )

            # Make row index (from first column of blocks) into a column called 'first_column'
            data.reset_index(inplace=True)
            data.rename(columns={data.columns[0]: "first_column"},
                        inplace=True)

            raw_results = pd.concat([raw_results, data])
        except Exception as e:
            err = ui.add_new_error(
                err,
                "system",
                "special_formats.read_multi_sheet_excel",
                f"Unexpected exception while processing sheet {sh}: {e}",
            )
    return raw_results, err
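The core move in read_multi_sheet_excel is turning the header rows into a column MultiIndex and then melting to long format. A self-contained illustration of just that step, with an invented two-header block standing in for one sheet:

import pandas as pd

raw = pd.DataFrame(
    [
        ["Precinct", "Governor", None],   # header_0; None gets forward-filled
        ["Precinct", "Smith", "Jones"],   # header_1
        ["P-1", 100, 120],
        ["P-2", 80, 95],
    ]
)
header_row_count = 2
names = [f"header_{j}" for j in range(header_row_count)]

data = raw.copy()
data.columns = pd.MultiIndex.from_frame(
    data.iloc[:header_row_count].transpose().ffill(), names=names
)
data = data.iloc[header_row_count:].set_index(data.columns[0])
long = pd.melt(data, ignore_index=False, value_name="count", var_name=names)
print(long)  # one row per (precinct, contest, candidate) with its count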
Example #4
def read_concatenated_blocks(
    f_path: str, munger: jm.Munger, err: dict
) -> Tuple[pd.DataFrame, dict]:
    """Assumes first column of each block is ReportingUnit, last column is contest total"""
    try:
        with open(f_path, "r") as f:
            data = f.readlines()
    except Exception as exc:
        err = ui.add_new_error(err, "file", f_path,
                               f"Datafile not read:\n{exc}\n")
        return pd.DataFrame(), err

    # get munger parameters
    w = munger.options["column_width"]
    tlts = munger.options["count_of_top_lines_to_skip"]
    v_t_cc = munger.options["last_header_column_count"]
    skip_cols = munger.options["columns_to_skip"]

    df = dict()

    # skip lines at top
    data = data[tlts:]

    try:
        while len(data) > 3:  # a block needs its three header lines plus data
            # TODO allow number & interps of headers to vary?
            # get rid of blank lines
            while data[0] == "\n":
                data.pop(0)

            # get the header lines
            header_0 = data.pop(0).strip()
            header_1 = data.pop(0)
            header_line = data.pop(0)

            # get info from header line
            field_list = extract_items(header_line, w)

            # Add back county header in case of Iowa:
            if header_line.startswith(" " * w):
                field_list = [""] + field_list

            # remove first column header and headers of any columns to be skipped
            last_header = remove_by_index(field_list, [0] + skip_cols)

            # check that the size of the side-to-side repeated block is consistent
            if len(last_header) % v_t_cc != 0:
                e = (
                    f"Last header column count per munger ({v_t_cc}) "
                    f"does not evenly divide the number of count columns "
                    f"in the results file ({len(last_header)})"
                )
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger.name,
                    e,
                )
                return pd.DataFrame(), err

            # get list from next header row and disambiguate
            # TODO tech debt: disambiguation assumes Candidate formula is <header_1>
            header_1_list, alts = disambiguate(
                extract_items(header_1, w * v_t_cc))

            #  add disambiguated entries to munger's dictionary of alternatives
            if alts:
                if "Candidate" in munger.alt.keys():
                    munger.alt["Candidate"].update(alts)
                else:
                    munger.alt["Candidate"] = alts

            # create df from next batch of lines, with that multi-index
            # find idx of next empty line (or end of data)
            try:
                next_empty = next(idx for idx in range(len(data))
                                  if data[idx] == "\n")
            except StopIteration:
                next_empty = len(data)
            # create io
            vote_count_block = io.StringIO()
            vote_count_block.write("".join(data[:next_empty]))
            vote_count_block.seek(0)

            df[header_0] = pd.read_fwf(
                vote_count_block, colspecs="infer", header=None
            )

            # Drop extraneous columns (per munger). Negative numbers count from right side
            df[header_0].drop(df[header_0].columns[skip_cols],
                              axis=1,
                              inplace=True)

            # make first column into an index
            df[header_0].set_index(keys=[0], inplace=True)

            # add multi-index info: repeat each header_1 entry v_t_cc times
            # so it lines up with the count columns beneath it
            index_array = [
                [cand for cand in header_1_list for _ in range(v_t_cc)],
                last_header,
            ]

            # Create map from integer columns to (header_1, header_2) values
            header_map = {}
            for i, col in enumerate(df[header_0].columns):
                header_map[col] = (index_array[0][i], index_array[1][i])

            # Move header to columns
            df[header_0] = pd.melt(
                df[header_0],
                ignore_index=False,
                value_vars=df[header_0].columns.tolist(),
                value_name="count",
                var_name="header_tmp",
            )

            # Gather values for header_1 and header_2 columns.
            header_1_col = [
                header_map[i][0] for i in df[header_0]["header_tmp"]
            ]
            header_2_col = [
                header_map[i][1] for i in df[header_0]["header_tmp"]
            ]

            # Add header_1 and header_2 columns, and remove header_tmp.
            df[header_0]["header_1"] = header_1_col
            df[header_0]["header_2"] = header_2_col
            df[header_0] = df[header_0].drop(columns="header_tmp")

            # Add columns for header_0
            df[header_0] = m.add_constant_column(df[header_0], "header_0",
                                                 header_0)

            # remove processed lines from data
            data = data[next_empty:]
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger.name,
            f"unparsed lines at bottom of file ({Path(f_path).name}):\n{data}\n"
            f"(exception: {exc})",
        )

    # consolidate all into one dataframe
    try:
        raw_results = pd.concat(list(df.values()))
    except ValueError as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error concatenating data from blocks: {e}",
        )
        return pd.DataFrame(), err

    # Make row index (from first column of blocks) into a column called 'first_column'
    raw_results.reset_index(inplace=True)
    # TODO tech debt is next line still necessary?
    raw_results.rename(columns={0: "first_column"}, inplace=True)

    return raw_results, err
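Within each block, pd.read_fwf does the column-splitting by inferring fixed-width boundaries from an in-memory buffer. A minimal, self-contained sketch of that piece plus the melt that follows (block text invented; the real code attaches candidate and header labels before melting):

import io

import pandas as pd

block = (
    "Adair            100      80\n"
    "Adams             75      60\n"
)
counts = pd.read_fwf(io.StringIO(block), colspecs="infer", header=None)
counts.set_index(keys=[0], inplace=True)  # first column (ReportingUnit) as index
long = pd.melt(counts, ignore_index=False, value_name="count", var_name="header_tmp")
print(long)  # one row per (reporting unit, count column) pair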
Example #5
    def load_contests(self, engine, contest_type: str, error: dict) -> dict:
        """Read <contest_type>Contest records from the jurisdiction folder,
        insert them into the database, and record any problems in error."""
        element_fpath = os.path.join(self.path_to_juris_dir,
                                     f"{contest_type}Contest.txt")
        if not os.path.exists(element_fpath):
            error[f"{contest_type}Contest.txt"] = "file not found"
            return error
        df = pd.read_csv(
            element_fpath,
            sep="\t",
            encoding="iso-8859-1",
            quoting=csv.QUOTE_MINIMAL,
        ).fillna("none or unknown")

        # add contest_type column
        df = m.add_constant_column(df, "contest_type", contest_type)

        # add 'none or unknown' record
        df = add_none_or_unknown(df, contest_type=contest_type)

        # dedupe df
        dupes, df = ui.find_dupes(df)
        if not dupes.empty:
            print(
                "WARNING: duplicates removed from dataframe, may indicate a problem.\n"
            )
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["found_duplicates"] = True

        # insert into Contest table, recording any database error
        e = db.insert_to_cdf_db(engine, df[["Name", "contest_type"]], "Contest")
        if e:
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["database"] = e

        # append Contest_Id
        col_map = {"Name": "Name", "contest_type": "contest_type"}
        df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

        if contest_type == "BallotMeasure":
            # append ElectionDistrict_Id, Election_Id
            for fk, ref in [
                ("ElectionDistrict", "ReportingUnit"),
                ("Election", "Election"),
            ]:
                col_map = {fk: "Name"}
                df = (db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={
                        f"{ref}_Id": f"{fk}_Id"
                    }).drop(fk, axis=1))

        else:
            # append Office_Id, PrimaryParty_Id
            for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
                col_map = {fk: "Name"}
                df = db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={f"{ref}_Id": f"{fk}_Id"})

        # commit contest info in df to the <contest_type>Contest table in the db
        err = db.insert_to_cdf_db(
            engine,
            df.rename(columns={"Contest_Id": "Id"}),
            f"{contest_type}Contest",
        )
        if err:
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["database"] = err
        return error
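load_contests reports problems by growing the caller's error dictionary rather than raising. A runnable sketch of the two shapes that dictionary takes, mirroring the branches above (values invented):

error = dict()

# missing file: a flat message keyed by file name (the early-return branch)
error["CandidateContest.txt"] = "file not found"

# later problems nest under the contest key (dedupe and database branches)
if "BallotMeasureContest" not in error:
    error["BallotMeasureContest"] = {}
error["BallotMeasureContest"]["found_duplicates"] = True

print(error)
# {'CandidateContest.txt': 'file not found',
#  'BallotMeasureContest': {'found_duplicates': True}}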