def create_ballot_measure_contests(df, columns):
    """Merge ballot-measure contests with their selections and return the
    requested columns, plus a constant contest_type column."""
    ballotmeasure_df = (
        df["ContestSelectionJoin"]
        .merge(
            df["BallotMeasureContest"],
            how="right",
            left_on="Contest_Id",
            right_index=True,
        )
        .rename(columns={"Name": "Contest"})
        .merge(
            df["BallotMeasureSelection"],
            how="left",
            left_on="Selection_Id",
            right_index=True,
        )
    )
    ballotmeasure_df = ballotmeasure_df[columns]
    if ballotmeasure_df.empty:
        ballotmeasure_df["contest_type"] = None
    else:
        ballotmeasure_df = m.add_constant_column(
            ballotmeasure_df, "contest_type", "BallotMeasure"
        )
    return ballotmeasure_df
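# --- Illustrative sketch (not part of the pipeline) ---
# create_ballot_measure_contests expects `df` to be a dict of DataFrames keyed
# by table name, each indexed by database Id. The toy tables below are
# invented for demonstration; only the table and column names used in the
# merges above are assumed.
def _demo_create_ballot_measure_contests():
    import pandas as pd

    toy = {
        "ContestSelectionJoin": pd.DataFrame(
            {"Contest_Id": [1], "Selection_Id": [10]}
        ),
        "BallotMeasureContest": pd.DataFrame(
            {"Name": ["Measure A"]}, index=pd.Index([1], name="Id")
        ),
        "BallotMeasureSelection": pd.DataFrame(
            {"Selection": ["Yes"]}, index=pd.Index([10], name="Id")
        ),
    }
    # expect one row: Contest="Measure A", Selection="Yes",
    # contest_type="BallotMeasure"
    return create_ballot_measure_contests(toy, ["Contest", "Selection"])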
def create_candidate_contests(df, columns):
    """Assemble candidate-contest rows by joining vote counts to contests,
    selections, candidates, candidate contests, and offices."""
    contest_df = (
        df["VoteCount"]
        .merge(df["Contest"], how="left", left_on="Contest_Id", right_index=True)
        .rename(columns={"Name": "Contest", "Id": "ContestSelectionJoin_Id"})
        .merge(
            df["CandidateSelection"],
            how="left",
            left_on="Selection_Id",
            right_index=True,
        )
        .merge(df["Candidate"], how="left", left_on="Candidate_Id", right_index=True)
        .rename(columns={"BallotName": "Selection"})
        .merge(
            df["CandidateContest"], how="left", left_on="Contest_Id", right_index=True
        )
        .merge(df["Office"], how="left", left_on="Office_Id", right_index=True)
    )
    contest_df = contest_df[columns]
    if contest_df.empty:
        contest_df["contest_type"] = None
    else:
        contest_df = m.add_constant_column(contest_df, "contest_type", "Candidate")
    return contest_df
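# Both functions above rely on m.add_constant_column. Its real definition
# lives in the `m` module; a minimal pandas-only stand-in (assumed behavior:
# assign one constant value as a new column) would look like this.
def _sketch_add_constant_column(df_in, col: str, value):
    # return a copy of df_in with `col` set to `value` on every row
    return df_in.assign(**{col: value})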
def read_multi_sheet_excel(
    f_path: str,
    munger: jm.Munger,
    err: dict,
) -> (pd.DataFrame, dict):
    # get munger parameters
    sheets_to_skip = munger.options["sheets_to_skip"]
    count_of_top_lines_to_skip = munger.options["count_of_top_lines_to_skip"]
    constant_line_count = munger.options["constant_line_count"]
    constant_column_count = munger.options["constant_column_count"]
    header_row_count = munger.options["header_row_count"]
    columns_to_skip = munger.options["columns_to_skip"]

    try:
        df = pd.read_excel(f_path, sheet_name=None, header=None)
    except Exception as e:
        new_err = ui.add_new_error(
            err, "file", Path(f_path).name, f"Error reading file: {e}"
        )
        if new_err:
            err = ui.consolidate_errors([err, new_err])
        # without a successful read there is no data to process
        return pd.DataFrame(), err

    sheets_to_read = [k for k in df.keys() if k not in sheets_to_skip]

    raw_results = pd.DataFrame()
    for sh in sheets_to_read:
        try:
            data = df[sh].copy()

            # remove lines designated ignorable
            data.drop(data.index[:count_of_top_lines_to_skip], inplace=True)

            # remove any all-null rows
            data.dropna(how="all", inplace=True)

            # read constant_line info from first non-null entries of
            # constant-header rows, then drop those rows
            if constant_line_count > 0:
                constant_lines = (
                    data.iloc[:constant_line_count]
                    .fillna(method="bfill", axis=1)
                    .iloc[:, 0]
                )
                data.drop(data.index[:constant_line_count], inplace=True)

            # read constant_column info from first non-null entries of
            # constant columns, then drop those columns
            if constant_column_count > 0:
                constant_columns = (
                    data.T.iloc[:constant_column_count]
                    .fillna(method="bfill", axis=1)
                    .iloc[:, 0]
                )
                data.drop(
                    data.columns[:constant_column_count], axis=1, inplace=True
                )

            # add multi-index for actual header rows
            header_variable_names = [f"header_{j}" for j in range(header_row_count)]
            col_multi_index = pd.MultiIndex.from_frame(
                data.iloc[range(header_row_count), :]
                .transpose()
                .fillna(method="ffill"),
                names=header_variable_names,
            )
            data.columns = col_multi_index

            # remove header rows from data
            data.drop(data.index[:header_row_count], inplace=True)

            # drop extraneous columns per munger, and columns without data
            data.drop(data.columns[columns_to_skip], axis=1, inplace=True)
            data.dropna(axis=1, how="all", inplace=True)

            # make first column into an index
            data.set_index(keys=data.columns[0], inplace=True)

            # move header info to columns
            data = pd.melt(
                data,
                ignore_index=False,
                value_name="count",
                var_name=header_variable_names,
            )

            # add column(s) for constant info
            for j in range(constant_line_count):
                data = m.add_constant_column(
                    data, f"constant_line_{j}", constant_lines.iloc[j]
                )
            for j in range(constant_column_count):
                data = m.add_constant_column(
                    data, f"constant_column_{j}", constant_columns.iloc[j]
                )

            # make row index (from first column of blocks) into a column
            # called 'first_column'
            data.reset_index(inplace=True)
            data.rename(columns={data.columns[0]: "first_column"}, inplace=True)

            raw_results = pd.concat([raw_results, data])
        except Exception as e:
            err = ui.add_new_error(
                err,
                "system",
                "special_formats.read_multi_sheet_excel",
                f"Unexpected exception while processing sheet {sh}: {e}",
            )
    return raw_results, err
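# Standalone illustration of the header-to-long-format step used in
# read_multi_sheet_excel: the header rows become a column MultiIndex, and
# pd.melt then turns each (header_0, header_1, cell) triple into one row.
# Toy data only; no munger involved.
def _demo_header_melt():
    import pandas as pd

    data = pd.DataFrame(
        [["President", "President"], ["Smith", "Jones"], [10, 20], [30, 40]]
    )
    header_row_count = 2
    names = [f"header_{j}" for j in range(header_row_count)]
    data.columns = pd.MultiIndex.from_frame(
        data.iloc[:header_row_count].transpose().fillna(method="ffill"),
        names=names,
    )
    # drop the header rows, leaving only counts
    data = data.iloc[header_row_count:]
    # one row per (header_0, header_1, count) combination
    return pd.melt(data, ignore_index=False, value_name="count", var_name=names)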
def read_concatenated_blocks(
    f_path: str, munger: jm.Munger, err: dict
) -> (pd.DataFrame, dict):
    """Assumes first column of each block is ReportingUnit, last column is contest total"""
    try:
        with open(f_path, "r") as f:
            data = f.readlines()
    except Exception as exc:
        err = ui.add_new_error(err, "file", f_path, f"Datafile not read:\n{exc}\n")
        return pd.DataFrame(), err

    # get munger parameters
    w = munger.options["column_width"]
    tlts = munger.options["count_of_top_lines_to_skip"]
    v_t_cc = munger.options["last_header_column_count"]
    skip_cols = munger.options["columns_to_skip"]

    df = dict()

    # skip lines at top
    data = data[tlts:]

    try:
        while len(data) > 3:
            # TODO allow number & interpretation of headers to vary?
            # get rid of blank lines
            while data[0] == "\n":
                data.pop(0)

            # get the header lines
            header_0 = data.pop(0).strip()
            header_1 = data.pop(0)
            header_line = data.pop(0)

            # get info from header line
            field_list = extract_items(header_line, w)

            # add back county header in case of Iowa:
            if header_line.startswith(" " * w):
                field_list = [""] + field_list

            # remove first column header and headers of any columns to be skipped
            last_header = remove_by_index(field_list, [0] + skip_cols)

            # check that the size of the side-to-side repeated block is consistent
            if len(last_header) % v_t_cc != 0:
                e = (
                    f"Count of last header (per munger) ({v_t_cc}) "
                    f"does not evenly divide the number of count columns in the results file "
                    f"({len(last_header)})"
                )
                err = ui.add_new_error(err, "munger", munger.name, e)
                return pd.DataFrame(), err

            # get list from next header row and disambiguate
            # TODO tech debt: disambiguation assumes Candidate formula is <header_1>
            header_1_list, alts = disambiguate(extract_items(header_1, w * v_t_cc))

            # add disambiguated entries to munger's dictionary of alternatives
            if alts:
                if "Candidate" in munger.alt.keys():
                    munger.alt["Candidate"].update(alts)
                else:
                    munger.alt["Candidate"] = alts

            # create df from next batch of lines, with that multi-index
            # find idx of next empty line (or end of data)
            try:
                next_empty = next(
                    idx for idx in range(len(data)) if data[idx] == "\n"
                )
            except StopIteration:
                next_empty = len(data)

            # create io
            vote_count_block = io.StringIO()
            vote_count_block.write("".join(data[:next_empty]))
            vote_count_block.seek(0)

            # note: read_fwf has no `index` keyword; the index is set below
            df[header_0] = pd.read_fwf(
                vote_count_block, colspecs="infer", header=None
            )

            # drop extraneous columns (per munger); negative numbers count
            # from the right side
            df[header_0].drop(df[header_0].columns[skip_cols], axis=1, inplace=True)

            # make first column into an index
            df[header_0].set_index(keys=[0], inplace=True)

            # add multi-index with header_1 and header_2 info
            index_array = [
                [
                    y
                    for z in [[cand] * v_t_cc for cand in header_1_list]
                    for y in z
                ],
                last_header,
            ]

            # create map from integer columns to (header_1, header_2) values
            header_map = {}
            for i, col in enumerate(df[header_0].columns):
                header_map[col] = (index_array[0][i], index_array[1][i])

            # move header to columns
            df[header_0] = pd.melt(
                df[header_0],
                ignore_index=False,
                value_vars=df[header_0].columns.tolist(),
                value_name="count",
                var_name="header_tmp",
            )

            # gather values for header_1 and header_2 columns
            header_1_col = [header_map[i][0] for i in df[header_0]["header_tmp"]]
            header_2_col = [header_map[i][1] for i in df[header_0]["header_tmp"]]

            # add header_1 and header_2 columns, and remove header_tmp
            df[header_0]["header_1"] = header_1_col
            df[header_0]["header_2"] = header_2_col
            df[header_0] = df[header_0].drop(columns="header_tmp")

            # add column for header_0
            df[header_0] = m.add_constant_column(df[header_0], "header_0", header_0)

            # remove processed lines from data
            data = data[next_empty:]
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger.name,
            f"unparsed lines at bottom of file ({Path(f_path).name}):\n{data}\n",
        )

    # consolidate all into one dataframe
    try:
        raw_results = pd.concat(list(df.values()))
    except ValueError as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error concatenating data from blocks: {e}",
        )
        return pd.DataFrame(), err

    # make row index (from first column of blocks) into a column called 'first_column'
    raw_results.reset_index(inplace=True)
    # TODO tech debt: is next line still necessary?
    raw_results.rename(columns={0: "first_column"}, inplace=True)

    return raw_results, err
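# extract_items and remove_by_index are helpers defined elsewhere in this
# module. The stand-ins below show the behavior read_concatenated_blocks
# depends on -- fixed-width slicing and positional deletion. Assumed
# behavior, for illustration only.
def _sketch_extract_items(line: str, w: int) -> list:
    # slice a line into width-w chunks, strip whitespace, drop trailing blanks
    items = [line[i : i + w].strip() for i in range(0, len(line), w)]
    while items and items[-1] == "":
        items.pop()
    return items


def _sketch_remove_by_index(lst: list, idx_list: list) -> list:
    # return a copy of lst with the positions in idx_list removed
    # (negative indices count from the right, as with columns_to_skip)
    drop = {i % len(lst) for i in idx_list}
    return [x for i, x in enumerate(lst) if i not in drop]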
def load_contests(self, engine, contest_type: str, error: dict) -> dict:
    # read <contest_type>Contests from jurisdiction folder
    element_fpath = os.path.join(
        self.path_to_juris_dir, f"{contest_type}Contest.txt"
    )
    if not os.path.exists(element_fpath):
        error[f"{contest_type}Contest.txt"] = "file not found"
        return error
    df = pd.read_csv(
        element_fpath, sep="\t", encoding="iso-8859-1", quoting=csv.QUOTE_MINIMAL
    ).fillna("none or unknown")

    # add contest_type column
    df = m.add_constant_column(df, "contest_type", contest_type)

    # add 'none or unknown' record
    df = add_none_or_unknown(df, contest_type=contest_type)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        print("WARNING: duplicates removed from dataframe, may indicate a problem.\n")
        if f"{contest_type}Contest" not in error:
            error[f"{contest_type}Contest"] = {}
        error[f"{contest_type}Contest"]["found_duplicates"] = True

    # insert into Contest table
    e = db.insert_to_cdf_db(engine, df[["Name", "contest_type"]], "Contest")

    # append Contest_Id
    col_map = {"Name": "Name", "contest_type": "contest_type"}
    df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

    if contest_type == "BallotMeasure":
        # append ElectionDistrict_Id, Election_Id
        for fk, ref in [
            ("ElectionDistrict", "ReportingUnit"),
            ("Election", "Election"),
        ]:
            col_map = {fk: "Name"}
            df = (
                db.append_id_to_dframe(engine, df, ref, col_map=col_map)
                .rename(columns={f"{ref}_Id": f"{fk}_Id"})
                .drop(fk, axis=1)
            )
    else:
        # append Office_Id, PrimaryParty_Id
        for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
            col_map = {fk: "Name"}
            df = db.append_id_to_dframe(engine, df, ref, col_map=col_map).rename(
                columns={f"{ref}_Id": f"{fk}_Id"}
            )

    # commit info in df to the <contest_type>Contest table in the db
    err = db.insert_to_cdf_db(
        engine, df.rename(columns={"Contest_Id": "Id"}), f"{contest_type}Contest"
    )
    if err:
        if f"{contest_type}Contest" not in error:
            error[f"{contest_type}Contest"] = {}
        error[f"{contest_type}Contest"]["database"] = err
    return error
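# Hypothetical pandas-only stand-in for db.append_id_to_dframe as used in
# load_contests (assumed behavior: look up each row's Name in a database
# table and attach the matching Id as a "<table>_Id" column). The real
# helper queries the database; this version takes a plain dict for
# illustration.
def _sketch_append_id_to_dframe(df_in, name_to_id: dict, table: str):
    import pandas as pd

    # build a two-column lookup frame: Name -> <table>_Id
    lookup = (
        pd.Series(name_to_id, name=f"{table}_Id").rename_axis("Name").reset_index()
    )
    return df_in.merge(lookup, how="left", on="Name")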