def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
    if force_date is None:
        force_date = date_from_str(raw_s3_file)
    super().__init__(
        raw_s3_file=raw_s3_file,
        config_file=config_file,
        force_date=force_date,
        **kwargs,
    )
    self.raw_s3_file = raw_s3_file
    self.processed_file = None
def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
    # Attempt to get date from file if not specifically given
    if force_date is None:
        force_date = date_from_str(raw_s3_file)
    # Call the main Preprocessor init
    super().__init__(
        raw_s3_file=raw_s3_file,
        config_file=config_file,
        force_date=force_date,
        **kwargs,
    )
    # Initialize some properties
    self.raw_s3_file = raw_s3_file
    self.processed_file = None
def add_history(main_df):
    # nested inside execute(), so `self` is available via closure;
    # also save as sparse array since so many elections are stored
    count_df = pd.DataFrame()
    for idx, hist in enumerate(self.config["hist_columns"]):
        unique_codes, counts = np.unique(
            main_df[hist].str.replace(" ", "_").dropna().values,
            return_counts=True,
        )
        count_df_new = pd.DataFrame(
            index=unique_codes, data=counts, columns=["counts_" + hist]
        )
        count_df = pd.concat([count_df, count_df_new], axis=1)
    count_df["total_counts"] = count_df.sum(axis=1)
    unique_codes = count_df.index.values
    counts = count_df["total_counts"].values
    count_order = counts.argsort()
    unique_codes = unique_codes[count_order]
    counts = counts[count_order]
    sorted_codes = unique_codes.tolist()
    sorted_codes_dict = {
        k: {
            "index": i,
            "count": int(counts[i]),
            "date": date_from_str(k),
        }
        for i, k in enumerate(sorted_codes)
    }

    def insert_code_bin(arr):
        return [sorted_codes_dict[k]["index"] for k in arr]

    main_df["all_history"] = main_df[self.config["hist_columns"]].apply(
        lambda x: list(x.dropna().str.replace(" ", "_")), axis=1
    )
    main_df.all_history = main_df.all_history.map(insert_code_bin)
    return sorted_codes, sorted_codes_dict
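# A minimal standalone sketch (toy column names, not part of the
# preprocessor) of the encoding add_history produces: each election code
# gets a stable integer index, and each voter's history becomes a list of
# those indices instead of one wide column per election.
import pandas as pd

toy = pd.DataFrame(
    {"hist1": ["2020_GEN", "2018_GEN"], "hist2": ["2016_GEN", None]}
)
codes = [c for c in pd.unique(toy.values.ravel("K")) if pd.notna(c)]
index_of = {k: i for i, k in enumerate(codes)}
sparse = toy.apply(lambda r: [index_of[v] for v in r.dropna()], axis=1)
# sparse: row 0 -> [0, 1], row 1 -> [2]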
def handle_date(d):
    possible_date = date_from_str(d)
    if possible_date is None:
        return ""
    return pd.to_datetime(possible_date).strftime("%m/%d/%Y")
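# Usage sketch for handle_date, assuming date_from_str (a helper defined
# elsewhere in this module) can pull "2016-11-08" out of the first string:
#
#     handle_date("general_2016-11-08.txt")  # -> "11/08/2016"
#     handle_date("no date here")            # -> ""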
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    logging.info("preprocessing florida")
    # new_files is a list of dicts, i.e. [{"name": ..., "obj": <fileobj>}, ...]
    new_files = self.unpack_files(
        compression="unzip", file_obj=self.main_file
    )
    del self.main_file, self.temp_files
    gc.collect()

    vote_history_files = []
    voter_files = []
    for i in new_files:
        if "_H_" in i["name"]:
            vote_history_files.append(i)
        elif ".txt" in i["name"]:
            voter_files.append(i)

    if not self.ignore_checks:
        self.file_check(len(voter_files))

    concat_voter_file = concat_and_delete(voter_files)
    concat_history_file = concat_and_delete(vote_history_files)
    del new_files, vote_history_files, voter_files
    gc.collect()

    logging.info("FLORIDA: loading voter history file")
    df_hist = pd.read_fwf(concat_history_file, header=None)
    try:
        df_hist.columns = self.config["hist_columns"]
    except ValueError:
        logging.info("Incorrect history columns found in Florida")
        raise MissingNumColumnsError(
            "{} state history is missing columns".format(self.state),
            self.state,
            len(self.config["hist_columns"]),
            len(df_hist.columns),
        )
    del concat_history_file
    gc.collect()

    df_hist = df_hist[df_hist["date"].map(lambda x: len(x)) > 5]
    df_hist["election_name"] = (
        df_hist["date"] + "_" + df_hist["election_type"]
    )

    valid_elections, counts = np.unique(
        df_hist["election_name"], return_counts=True
    )
    date_order = [
        idx
        for idx, election in sorted(
            enumerate(valid_elections),
            key=lambda x: datetime.strptime(x[1][:-4], "%m/%d/%Y"),
            reverse=True,
        )
    ]
    valid_elections = valid_elections[date_order]
    counts = counts[date_order]
    sorted_codes = valid_elections.tolist()
    sorted_codes_dict = {
        k: {"index": i, "count": int(counts[i]), "date": date_from_str(k)}
        for i, k in enumerate(sorted_codes)
    }

    df_hist["array_position"] = df_hist["election_name"].map(
        lambda x: int(sorted_codes_dict[x]["index"])
    )
    del valid_elections, counts, date_order
    gc.collect()

    logging.info("FLORIDA: history apply")
    voter_groups = df_hist.groupby("VoterID")
    all_history = voter_groups["array_position"].apply(list)
    vote_type = voter_groups["vote_type"].apply(list)
    del voter_groups, df_hist
    gc.collect()

    logging.info("FLORIDA: loading main voter file")
    df_voters = self.read_csv_count_error_lines(
        concat_voter_file, header=None, sep="\t", error_bad_lines=False
    )
    del concat_voter_file
    gc.collect()

    try:
        df_voters.columns = self.config["ordered_columns"]
    except ValueError:
        logging.info("Incorrect number of columns found for Florida")
        raise MissingNumColumnsError(
            "{} state is missing voters columns".format(self.state),
            self.state,
            len(self.config["ordered_columns"]),
            len(df_voters.columns),
        )
    df_voters = df_voters.set_index(self.config["voter_id"])

    df_voters["all_history"] = all_history
    df_voters["vote_type"] = vote_type
    del all_history, vote_type
    gc.collect()

    df_voters = self.config.coerce_strings(df_voters)
    df_voters = self.config.coerce_dates(df_voters)
    df_voters = self.config.coerce_numeric(
        df_voters,
        extra_cols=[
            "Precinct",
            "Precinct_Split",
            "Daytime_Phone_Number",
            "Daytime_Area_Code",
            "Daytime_Phone_Extension",
            "Mailing_Zipcode",
            "Residence_Zipcode",
            "Mailing_Address_Line_1",
            "Mailing_Address_Line_2",
            "Mailing_Address_Line_3",
            "Residence_Address_Line_1",
            "Residence_Address_Line_2",
        ],
    )

    self.meta = {
        "message": "florida_{}".format(datetime.now().isoformat()),
        "array_encoding": sorted_codes_dict,
        "array_decoding": sorted_codes,
    }

    csv_obj = df_voters.to_csv(encoding="utf-8")
    del df_voters
    gc.collect()

    logging.info("FLORIDA: writing out")
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    # config = Config('michigan')
    new_files = self.unpack_files(file_obj=self.main_file)
    del self.main_file, self.temp_files
    gc.collect()

    if not self.ignore_checks:
        self.file_check(len(new_files))

    voter_file = ([
        n for n in new_files
        if "entire_state_v" in n["name"] or "EntireStateVoters" in n["name"]
    ] + [None])[0]
    hist_file = ([
        n for n in new_files
        if "entire_state_h" in n["name"]
        or "EntireStateVoterHistory" in n["name"]
    ] + [None])[0]
    elec_codes = ([n for n in new_files if "electionscd" in n["name"]] +
                  [None])[0]

    logging.info("Loading voter file: " + voter_file["name"])
    if voter_file["name"][-3:] == "lst":
        vcolspecs = [
            [0, 35], [35, 55], [55, 75], [75, 78],
            [78, 82], [82, 83], [83, 91], [91, 92],
            [92, 99], [99, 103], [103, 105], [105, 135],
            [135, 141], [141, 143], [143, 156], [156, 191],
            [191, 193], [193, 198], [198, 248], [248, 298],
            [298, 348], [348, 398], [398, 448], [448, 461],
            [461, 463], [463, 468], [468, 474], [474, 479],
            [479, 484], [484, 489], [489, 494], [494, 499],
            [499, 504], [504, 510], [510, 516], [516, 517],
            [517, 519],
        ]
        vdf = pd.read_fwf(
            voter_file["obj"],
            colspecs=vcolspecs,
            names=self.config["fwf_voter_columns"],
            na_filter=False,
        )
    elif voter_file["name"][-3:] == "csv":
        vdf = self.read_csv_count_error_lines(
            voter_file["obj"],
            encoding="latin-1",
            na_filter=False,
            error_bad_lines=False,
        )
        # rename 'STATE' field to not conflict with our 'state' field
        vdf.rename(columns={"STATE": "STATE_ADDR"}, inplace=True)
    else:
        raise NotImplementedError("File format not implemented")
    del voter_file
    gc.collect()

    def column_is_empty(col):
        total_size = col.shape[0]
        if (sum(col.isna()) == total_size) or (
                sum(col == "") == total_size):
            return True
        return False

    def fill_empty_columns(df):
        # Dummy values for newly added data fields
        if column_is_empty(df["STATUS_USER_CODE"]):
            df["STATUS_USER_CODE"] = "_"
        if column_is_empty(df["VOTER_ID"]):
            df["VOTER_ID"] = 0
        if column_is_empty(df["STATUS_DATE"]):
            df["STATUS_DATE"] = "1970-01-01 00:00:00"
        return df

    vdf = self.reconcile_columns(vdf, self.config["columns"])
    vdf = fill_empty_columns(vdf)
    vdf = vdf.reindex(columns=self.config["ordered_columns"])
    vdf[self.config["party_identifier"]] = "npa"

    logging.info("Loading history file: " + hist_file["name"])
    if hist_file["name"][-3:] == "lst":
        hcolspecs = [
            [0, 13], [13, 15], [15, 20],
            [20, 25], [25, 38], [38, 39],
        ]
        hdf = pd.read_fwf(
            hist_file["obj"],
            colspecs=hcolspecs,
            names=self.config["fwf_hist_columns"],
            na_filter=False,
        )
    elif hist_file["name"][-3:] == "csv":
        hdf = self.read_csv_count_error_lines(
            hist_file["obj"], na_filter=False, error_bad_lines=False
        )
        if ("IS_ABSENTEE_VOTER" not in hdf.columns) and (
                "IS_PERMANENT_ABSENTEE_VOTER" in hdf.columns):
            hdf.rename(
                columns={
                    "IS_PERMANENT_ABSENTEE_VOTER": "IS_ABSENTEE_VOTER"
                },
                inplace=True,
            )
    else:
        raise NotImplementedError("File format not implemented")
    del hist_file
    gc.collect()

    # If hdf has ELECTION_DATE (new style) instead of ELECTION_CODE,
    # then we don't need to do election code lookups
    elec_code_dict = dict()
    missing_history_dates = False
    if "ELECTION_DATE" in hdf.columns:
        try:
            hdf["ELECTION_NAME"] = pd.to_datetime(
                hdf["ELECTION_DATE"]
            ).map(lambda x: x.strftime("%Y-%m-%d"))
        except ValueError:
            missing_history_dates = True
            hdf["ELECTION_NAME"] = hdf["ELECTION_DATE"]
    else:
        if elec_codes:
            # If we have election codes in this file
            logging.info(
                "Loading election codes file: " + elec_codes["name"]
            )
            if elec_codes["name"][-3:] == "lst":
                ecolspecs = [[0, 13], [13, 21], [21, 46]]
                edf = pd.read_fwf(
                    elec_codes["obj"],
                    colspecs=ecolspecs,
                    names=self.config["elec_code_columns"],
                    na_filter=False,
                )
                edf["Date"] = pd.to_datetime(edf["Date"], format="%m%d%Y")
            elif elec_codes["name"][-3:] == "csv":
                # I'm not sure if this would actually ever happen
                edf = self.read_csv_count_error_lines(
                    elec_codes["obj"],
                    names=self.config["elec_code_columns"],
                    na_filter=False,
                    error_bad_lines=False,
                )
            else:
                raise NotImplementedError("File format not implemented")

            # make a code dictionary that will be stored with meta data
            for idx, row in edf.iterrows():
                d = row["Date"].strftime("%Y-%m-%d")
                elec_code_dict[row["Election_Code"]] = {
                    "Date": d,
                    "Slug": d + "_" + str(row["Election_Code"]) + "_" +
                            row["Title"].replace(" ", "-").replace("_", "-"),
                }
        else:
            # Get election codes from most recent meta data
            this_date = parser.parse(
                date_from_str(self.raw_s3_file)
            ).date()
            pre_date, post_date, pre_key, post_key = get_surrounding_dates(
                this_date, self.state, self.s3_bucket, testing=self.testing
            )
            if pre_key is not None:
                nearest_meta = get_metadata_for_key(pre_key, self.s3_bucket)
                elec_code_dict = nearest_meta["elec_code_dict"]
                if len(elec_code_dict) == 0:
                    raise MissingElectionCodesError(
                        "No election codes in nearby meta data."
                    )
            else:
                raise MissingElectionCodesError(
                    "No election code file or nearby meta data found."
                )

        # Election code lookup
        hdf["ELECTION_NAME"] = hdf["ELECTION_CODE"].map(
            lambda x: elec_code_dict[str(x)]["Slug"]
            if str(x) in elec_code_dict else str(x)
        )

    # Create meta data
    counts = hdf["ELECTION_NAME"].value_counts()
    counts.sort_index(inplace=True)
    sorted_codes = counts.index.to_list()
    sorted_codes_dict = {
        k: {
            "index": i,
            "count": int(counts.iloc[i]),
            "date": date_from_str(k),
        }
        for i, k in enumerate(sorted_codes)
    }

    # Collect histories
    vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
    hdf_id_groups = hdf.groupby(self.config["voter_id"])
    vdf["all_history"] = hdf_id_groups["ELECTION_NAME"].apply(list)
    vdf["votetype_history"] = hdf_id_groups["IS_ABSENTEE_VOTER"].apply(list)
    vdf["county_history"] = hdf_id_groups["COUNTY_CODE"].apply(list)
    vdf["jurisdiction_history"] = hdf_id_groups["JURISDICTION_CODE"].apply(
        list
    )
    vdf["schooldistrict_history"] = hdf_id_groups[
        "SCHOOL_DISTRICT_CODE"
    ].apply(list)
    del hdf, hdf_id_groups
    gc.collect()

    def insert_code_bin(arr):
        if isinstance(arr, list):
            return [
                sorted_codes_dict[k]["index"]
                for k in arr if k in sorted_codes_dict
            ]
        else:
            return np.nan

    vdf["sparse_history"] = vdf["all_history"].map(insert_code_bin)

    if missing_history_dates:
        vdf["all_history"] = None
        vdf["sparse_history"] = None

    vdf = self.config.coerce_dates(vdf)
    vdf = self.config.coerce_numeric(
        vdf,
        extra_cols=[
            "PRECINCT",
            "WARD",
            "VILLAGE_PRECINCT",
            "SCHOOL_PRECINCT",
        ],
    )
    vdf = self.config.coerce_strings(vdf)

    self.meta = {
        "message": "michigan_{}".format(datetime.now().isoformat()),
        "array_encoding": sorted_codes_dict,
        "array_decoding": sorted_codes,
        "elec_code_dict": elec_code_dict,
    }

    csv_obj = vdf.to_csv(encoding="utf-8", index=False)
    del vdf
    gc.collect()

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()
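# A minimal sketch of the slug format assembled above (values hypothetical):
# an election-codes row with Election_Code 123, Date 2018-11-06, and Title
# "General Election" becomes "2018-11-06_123_General-Election", so history
# rows keyed by ELECTION_CODE resolve to a label that stays unique and
# date-sortable even when two elections share a date.
d = "2018-11-06"
slug = (
    d + "_" + str(123) + "_"
    + "General Election".replace(" ", "-").replace("_", "-")
)
assert slug == "2018-11-06_123_General-Election"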
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    logging.info("Minnesota: loading voter file")
    new_files = self.unpack_files(
        compression="unzip", file_obj=self.main_file
    )
    if not self.ignore_checks:
        self.file_check(len(new_files))
    voter_reg_df = pd.DataFrame(columns=self.config["ordered_columns"])
    voter_hist_df = pd.DataFrame(columns=self.config["hist_columns"])
    for i in new_files:
        if "election" in i["name"].lower():
            voter_hist_df = pd.concat(
                [
                    voter_hist_df,
                    self.read_csv_count_error_lines(
                        i["obj"], error_bad_lines=False
                    ),
                ],
                axis=0,
            )
        elif "voter" in i["name"].lower():
            voter_reg_df = pd.concat(
                [
                    voter_reg_df,
                    self.read_csv_count_error_lines(
                        i["obj"],
                        encoding="latin-1",
                        error_bad_lines=False,
                    ),
                ],
                axis=0,
            )

    voter_reg_df[self.config["voter_status"]] = np.nan
    voter_reg_df[self.config["party_identifier"]] = np.nan
    # if the dataframes are assigned columns to begin with, there will be
    # nans due to concat if the columns are off
    self.column_check(list(voter_reg_df.columns))
    voter_reg_df["DOBYear"] = voter_reg_df["DOBYear"].astype(str).str[0:4]

    voter_hist_df["election_name"] = (
        voter_hist_df["ElectionDate"] + "_" + voter_hist_df["VotingMethod"]
    )
    valid_elections, counts = np.unique(
        voter_hist_df["election_name"], return_counts=True
    )
    date_order = [
        idx
        for idx, election in sorted(
            enumerate(valid_elections),
            key=lambda x: datetime.strptime(x[1][:-2], "%m/%d/%Y"),
            reverse=True,
        )
    ]
    valid_elections = valid_elections[date_order]
    counts = counts[date_order]
    sorted_codes = valid_elections.tolist()
    sorted_codes_dict = {
        k: {"index": i, "count": int(counts[i]), "date": date_from_str(k)}
        for i, k in enumerate(sorted_codes)
    }

    voter_hist_df["array_position"] = voter_hist_df["election_name"].map(
        lambda x: int(sorted_codes_dict[x]["index"])
    )

    logging.info("Minnesota: history apply")
    voter_groups = voter_hist_df.groupby("VoterId")
    all_history = voter_groups["array_position"].apply(list)
    vote_type = voter_groups["VotingMethod"].apply(list)

    voter_reg_df = voter_reg_df.set_index(self.config["voter_id"])
    voter_reg_df["all_history"] = all_history
    voter_reg_df["vote_type"] = vote_type
    gc.collect()

    voter_reg_df = self.config.coerce_strings(voter_reg_df)
    voter_reg_df = self.config.coerce_dates(voter_reg_df)
    voter_reg_df = self.config.coerce_numeric(voter_reg_df)

    self.meta = {
        "message": "minnesota_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }
    gc.collect()

    logging.info("Minnesota: writing out")
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(voter_reg_df.to_csv(encoding="utf-8")),
        s3_bucket=self.s3_bucket,
    )
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    def file_is_active(filename):
        for word in ["Canceled", "Suspense", "Inactive"]:
            if word in filename:
                return False
        return True

    def add_files_to_main_df(main_df, file_list):
        alias_dict = self.config["column_aliases"]
        for f in file_list:
            if f["name"].split(".")[-1] == "csv":
                new_df = self.read_csv_count_error_lines(
                    f["obj"], error_bad_lines=False
                )
            else:
                new_df = pd.read_excel(f["obj"])

            for c in new_df.columns:
                # files are inconsistent about spaces in headers, and
                # some headers go by more than one alias
                if c.replace(" ", "") in alias_dict.keys():
                    new_df.rename(
                        columns={c: alias_dict[c.replace(" ", "")]},
                        inplace=True,
                    )
                else:
                    new_df.rename(
                        columns={c: c.replace(" ", "")}, inplace=True
                    )
            new_df.rename(columns={"YearofBirth": "DOB"}, inplace=True)
            main_df = pd.concat([main_df, new_df], sort=False)
        return main_df

    def insert_code_bin(arr):
        return [sorted_codes_dict[k]["index"] for k in arr]

    new_files = self.unpack_files(
        file_obj=self.main_file, compression="unzip"
    )
    active_files = [f for f in new_files if file_is_active(f["name"])]
    other_files = [f for f in new_files if not file_is_active(f["name"])]

    main_df = pd.DataFrame()
    main_df = add_files_to_main_df(main_df, active_files)
    main_df = add_files_to_main_df(main_df, other_files)
    main_df.reset_index(drop=True, inplace=True)

    main_df = self.config.coerce_dates(main_df)
    main_df = self.config.coerce_strings(main_df)
    main_df = self.config.coerce_numeric(
        main_df,
        extra_cols=[
            "HouseNumber",
            "UnitNumber",
            "ResidenceZip",
            "MailingZip",
            "Phone",
            "PrecinctPart",
            "VRAZVoterID",
        ],
    )

    voter_columns = [c for c in main_df.columns if not c[0].isdigit()]
    history_columns = [c for c in main_df.columns if c[0].isdigit()]
    self.column_check(voter_columns)

    to_normalize = history_columns + [
        self.config["party_identifier"],
        self.config["voter_status"],
    ]
    for c in to_normalize:
        s = main_df[c].astype(str).str.strip().str.lower()
        s = s.str.encode("utf-8", errors="ignore").str.decode("utf-8")
        main_df.loc[(~main_df[c].isna()), c] = s.loc[(~main_df[c].isna())]
    for c in history_columns:
        main_df[c] = main_df[c].str.replace(" - ", "_")

    main_df[self.config["party_identifier"]] = main_df[
        self.config["party_identifier"]
    ].map(
        lambda x: self.config["party_aliases"][x]
        if x in self.config["party_aliases"] else x
    )

    # handle history:
    sorted_codes = history_columns[::-1]
    hist_df = main_df[sorted_codes].copy()
    voter_df = main_df[voter_columns].copy()
    counts = (~hist_df.isna()).sum()
    sorted_codes_dict = {
        k: {
            "index": int(i),
            # index counts by column name so the reversed code order
            # stays aligned with the right column
            "count": int(counts[k]),
            "date": date_from_str(k),
        }
        for i, k in enumerate(sorted_codes)
    }

    hist_df.loc[:, "vote_codes"] = pd.Series(hist_df.values.tolist())
    hist_df.loc[:, "vote_codes"] = hist_df.loc[:, "vote_codes"].map(
        lambda x: [c for c in x if not pd.isna(c)]
    )
    voter_df.loc[:, "votetype_history"] = hist_df.loc[:, "vote_codes"].map(
        lambda x: [c.split("_")[0] for c in x]
    )
    voter_df.loc[:, "party_history"] = hist_df.loc[:, "vote_codes"].map(
        lambda x: [
            c.split("_")[1]
            if len(c.split("_")) > 1
            else self.config["no_party_affiliation"]
            for c in x
        ]
    )

    hist_df.drop(columns=["vote_codes"], inplace=True)
    for c in hist_df.columns:
        hist_df.loc[:, c] = hist_df.loc[:, c].map(
            lambda x: c if not pd.isna(x) else np.nan
        )
    voter_df.loc[:, "all_history"] = pd.Series(hist_df.values.tolist())
    voter_df.loc[:, "all_history"] = voter_df.loc[:, "all_history"].map(
        lambda x: [c for c in x if not pd.isna(c)]
    )
    voter_df.loc[:, "sparse_history"] = voter_df.loc[:, "all_history"].map(
        insert_code_bin
    )

    expected_cols = (
        self.config["ordered_columns"]
        + self.config["ordered_generated_columns"]
    )
    voter_df = self.reconcile_columns(voter_df, expected_cols)
    voter_df = voter_df[expected_cols]

    self.meta = {
        "message": "arizona2_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(voter_df.to_csv(encoding="utf-8", index=False)),
        s3_bucket=self.s3_bucket,
    )
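# Hypothetical illustration of the history-cell parsing above: after
# lowercasing, a cell like "early - rep" becomes "early_rep", which splits
# into votetype "early" and party "rep"; a cell with no "_" falls back to
# the configured no_party_affiliation value ("npa" is assumed below).
cell = "early - rep".replace(" - ", "_")  # -> "early_rep"
votetype = cell.split("_")[0]             # -> "early"
party = cell.split("_")[1] if "_" in cell else "npa"  # -> "rep"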
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    new_files = self.unpack_files(self.main_file, compression="unzip")
    if not self.ignore_checks:
        self.file_check(len(new_files))
    voter_file = (
        new_files[0] if "ElgbVtr" in new_files[0]["name"] else new_files[1]
    )
    hist_file = (
        new_files[0] if "VtHst" in new_files[0]["name"] else new_files[1]
    )

    df_hist = self.read_csv_count_error_lines(
        hist_file["obj"], header=None, error_bad_lines=False
    )
    df_hist.columns = self.config["hist_columns"]
    df_voters = self.read_csv_count_error_lines(
        voter_file["obj"], header=None, error_bad_lines=False
    )
    del self.main_file, self.temp_files, voter_file, hist_file, new_files
    gc.collect()

    try:
        df_voters.columns = self.config["ordered_columns"]
    except ValueError:
        logging.info("Incorrect number of columns found for Nevada")
        raise MissingNumColumnsError(
            "{} state is missing columns".format(self.state),
            self.state,
            len(self.config["ordered_columns"]),
            len(df_voters.columns),
        )

    sorted_codes = df_hist.date.unique().tolist()
    sorted_codes.sort(key=lambda x: datetime.strptime(x, "%m/%d/%Y"))
    counts = df_hist.date.value_counts()
    sorted_codes_dict = {
        k: {
            "index": i,
            "count": int(counts.loc[k]),
            "date": date_from_str(k),
        }
        for i, k in enumerate(sorted_codes)
    }

    def insert_code_bin(arr):
        if isinstance(arr, list):
            return [sorted_codes_dict[k]["index"] for k in arr]
        else:
            return np.nan

    df_voters = df_voters.set_index("VoterID", drop=False)
    voter_id_groups = df_hist.groupby("VoterID")
    df_voters["all_history"] = voter_id_groups["date"].apply(list)
    df_voters["votetype_history"] = voter_id_groups["vote_code"].apply(list)
    del df_hist, voter_id_groups
    gc.collect()

    df_voters["sparse_history"] = df_voters["all_history"].map(
        insert_code_bin
    )

    # create compound string for unique voter ID from county ID
    df_voters["County_Voter_ID"] = (
        df_voters["County"].str.replace(" ", "").str.lower()
        + "_"
        + df_voters["County_Voter_ID"].astype(int).astype(str)
    )
    df_voters = self.config.coerce_dates(df_voters)
    df_voters = self.config.coerce_numeric(
        df_voters,
        extra_cols=[
            "Zip",
            "Phone",
            "Congressional_District",
            "Senate_District",
            "Assembly_District",
            "Education_District",
            "Regent_District",
            "Registered_Precinct",
        ],
    )
    df_voters = self.config.coerce_strings(df_voters)

    # standardize district data; formats have varied over time among
    # "1", "district 1", and "cd1"/"sd1"/"ad1"
    digits = re.compile(r"\d+")

    def get_district_number_str(x):
        try:
            s = digits.search(x)
        except TypeError:
            return None
        if s is not None:
            return s.group()
        return None

    for col in [
        "Congressional_District",
        "Senate_District",
        "Assembly_District",
    ]:
        df_voters[col] = df_voters[col].map(ensure_int_string)
        df_voters[col] = df_voters[col].map(get_district_number_str)

    self.meta = {
        "message": "nevada_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }

    csv_obj = df_voters.to_csv(encoding="utf-8", index=False)
    del df_voters
    gc.collect()

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()
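# Quick illustration of the district normalization above (inputs
# hypothetical): ensure_int_string first turns values like 1.0 into "1",
# then get_district_number_str pulls the digits out of whatever format
# remains:
#
#     get_district_number_str("district 1")  # -> "1"
#     get_district_number_str("cd12")        # -> "12"
#     get_district_number_str(None)          # -> None (TypeError is caught)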
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    new_files = self.unpack_files(file_obj=self.main_file)
    if not self.ignore_checks:
        self.file_check(len(new_files))

    # the file whose name contains "_22" seeds the dataframe; the
    # remaining .txt files are concatenated onto it
    for i in new_files:
        logging.info("Loading file {}".format(i["name"]))
        if "_22" in i["name"]:
            df = self.read_csv_count_error_lines(
                i["obj"],
                encoding="latin-1",
                compression="gzip",
                error_bad_lines=False,
            )
        elif ".txt" in i["name"]:
            temp_df = self.read_csv_count_error_lines(
                i["obj"],
                encoding="latin-1",
                compression="gzip",
                error_bad_lines=False,
            )
            df = pd.concat([df, temp_df], axis=0)

    # create history meta data
    voting_history_cols = list(
        filter(
            lambda x: any(
                [pre in x for pre in ("GENERAL-", "SPECIAL-", "PRIMARY-")]
            ),
            df.columns.values,
        )
    )
    self.column_check(list(set(df.columns) - set(voting_history_cols)))
    total_records = df.shape[0]
    sorted_codes = voting_history_cols
    sorted_codes_dict = {
        k: {
            "index": i,
            "count": int(total_records - df[k].isna().sum()),
            "date": date_from_str(k),
        }
        for i, k in enumerate(voting_history_cols)
    }

    # ensure district and other numeric fields are e.g. "1" not "1.0"
    int_string_cols = [
        "CONGRESSIONAL_DISTRICT",
        "STATE_REPRESENTATIVE_DISTRICT",
        "STATE_SENATE_DISTRICT",
        "COURT_OF_APPEALS",
        "STATE_BOARD_OF_EDUCATION",
        "RESIDENTIAL_ZIP",
        "RESIDENTIAL_ZIP_PLUS4",
        "MAILING_ZIP",
        "MAILING_ZIP_PLUS4",
    ]
    for col in int_string_cols:
        df[col] = df[col].map(ensure_int_string)

    self.meta = {
        "message": "ohio_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(df.to_csv(encoding="utf-8", index=False)),
        s3_bucket=self.s3_bucket,
    )
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    new_files = self.unpack_files(
        compression="unzip", file_obj=self.main_file
    )
    df_voter = pd.DataFrame(columns=self.config.raw_file_columns())
    df_hist = pd.DataFrame(columns=self.config["hist_columns"])
    df_master_voter = pd.DataFrame(
        columns=self.config["master_voter_columns"]
    )
    master_vf_version = True

    def master_to_reg_df(df):
        df.columns = self.config["master_voter_columns"]
        df["STATUS"] = df["VOTER_STATUS"]
        df["PRECINCT"] = df["PRECINCT_CODE"]
        df["VOTER_NAME"] = (
            df["LAST_NAME"] + ", " + df["FIRST_NAME"] + " " +
            df["MIDDLE_NAME"]
        )
        df = pd.concat(
            [df, pd.DataFrame(columns=self.config["blacklist_columns"])]
        )
        df = df[self.config.processed_file_columns()]
        return df

    for i in new_files:
        if "Registered_Voters_List" in i["name"]:
            master_vf_version = False

    for i in new_files:
        if "Public" not in i["name"]:
            if ("Registered_Voters_List" in i["name"]
                    and not master_vf_version):
                logging.info("reading in {}".format(i["name"]))
                # Colorado sends us a couple of different encodings. The
                # format detected as ascii errors out unless it is read
                # in as latin-1. The format typically detected as
                # utf-8-sig needs index_col explicitly set to False, or
                # pandas will read the voter-id column in as the index
                # and the history won't apply.
                encoding_result = chardet.detect(i["obj"].read(10000))
                if encoding_result["encoding"] == "ascii":
                    encoding = "latin-1"
                    index_col = None
                else:
                    encoding = encoding_result["encoding"]
                    index_col = False
                i["obj"].seek(0)
                df_voter = pd.concat(
                    [
                        df_voter,
                        self.read_csv_count_error_lines(
                            i["obj"],
                            encoding=encoding,
                            error_bad_lines=False,
                            index_col=index_col,
                        ),
                    ],
                    axis=0,
                )
            elif ("Voting_History" in i["name"]) or (
                    "Coordinated_Voter_Details" in i["name"]):
                if "Voter_Details" not in i["name"]:
                    logging.info("reading in {}".format(i["name"]))
                    new_df = self.read_csv_count_error_lines(
                        i["obj"],
                        compression="gzip",
                        error_bad_lines=False,
                    )
                    df_hist = pd.concat([df_hist, new_df], axis=0)

                if "Voter_Details" in i["name"] and master_vf_version:
                    logging.info("reading in {}".format(i["name"]))
                    new_df = self.read_csv_count_error_lines(
                        i["obj"],
                        compression="gzip",
                        error_bad_lines=False,
                    )
                    if len(new_df.columns) < len(
                            self.config["master_voter_columns"]):
                        new_df.insert(10, "PHONE_NUM", np.nan)
                    try:
                        new_df.columns = self.config[
                            "master_voter_columns"
                        ]
                    except ValueError:
                        logging.info(
                            "Incorrect number of columns found for "
                            "Colorado for file: {}".format(i["name"])
                        )
                        raise MissingNumColumnsError(
                            "{} state is missing columns".format(
                                self.state
                            ),
                            self.state,
                            len(self.config["master_voter_columns"]),
                            len(new_df.columns),
                        )
                    df_master_voter = pd.concat(
                        [df_master_voter, new_df], axis=0
                    )

    if df_voter.empty:
        df_voter = master_to_reg_df(df_master_voter)
    if df_hist.empty:
        raise ValueError("must supply a file containing voter history")

    df_hist["VOTING_METHOD"] = df_hist["VOTING_METHOD"].replace(np.nan, "")
    df_hist["ELECTION_DATE"] = pd.to_datetime(
        df_hist["ELECTION_DATE"], format="%m/%d/%Y", errors="coerce"
    )
    df_hist.dropna(subset=["ELECTION_DATE"], inplace=True)
    df_hist["election_name"] = (
        df_hist["ELECTION_DATE"].astype(str) + "_" +
        df_hist["VOTING_METHOD"]
    )

    valid_elections, counts = np.unique(
        df_hist["election_name"], return_counts=True
    )
    date_order = [
        idx
        for idx, election in sorted(
            enumerate(valid_elections),
            key=lambda x: datetime.strptime(x[1][0:10], "%Y-%m-%d"),
            reverse=True,
        )
    ]
    valid_elections = valid_elections[date_order]
    counts = counts[date_order]
    sorted_codes = valid_elections.tolist()
    sorted_codes_dict = {
        k: {"index": i, "count": int(counts[i]), "date": date_from_str(k)}
        for i, k in enumerate(sorted_codes)
    }

    df_hist["array_position"] = df_hist["election_name"].map(
        lambda x: int(sorted_codes_dict[x]["index"])
    )

    logging.info("Colorado: history apply")
    voter_groups = df_hist.groupby(self.config["voter_id"])
    all_history = voter_groups["array_position"].apply(list)
    vote_type = voter_groups["VOTING_METHOD"].apply(list)

    df_voter.dropna(subset=[self.config["voter_id"]], inplace=True)
    df_voter = df_voter.set_index(self.config["voter_id"])
    df_voter.sort_index(inplace=True)
    df_voter["all_history"] = all_history
    df_voter["vote_type"] = vote_type
    gc.collect()

    # at some point the mailing address field names changed
    for num in ["1", "2", "3"]:
        if f"MAIL_ADDR{num}" in df_voter.columns:
            # if both are present, combine them
            if f"MAILING_ADDRESS_{num}" in df_voter.columns:
                df_voter[f"MAILING_ADDRESS_{num}"] = np.where(
                    df_voter[f"MAILING_ADDRESS_{num}"].isnull(),
                    df_voter[f"MAIL_ADDR{num}"],
                    df_voter[f"MAILING_ADDRESS_{num}"],
                )
            else:
                df_voter[f"MAILING_ADDRESS_{num}"] = df_voter[
                    f"MAIL_ADDR{num}"
                ]
            df_voter.drop(columns=[f"MAIL_ADDR{num}"], inplace=True)

    df_voter = self.config.coerce_strings(df_voter)
    df_voter = self.config.coerce_dates(df_voter)
    df_voter = self.config.coerce_numeric(
        df_voter,
        extra_cols=[
            "HOUSE_NUM",
            "UNIT_NUM",
            "RESIDENTIAL_ZIP_CODE",
            "RESIDENTIAL_ZIP_PLUS",
            "MAILING_ZIP_CODE",
            "MAILING_ZIP_PLUS",
            "PRECINCT_NAME",
            "PRECINCT",
            "MAILING_ADDRESS_3",
            "PHONE_NUM",
        ],
    )

    self.meta = {
        "message": "Colorado_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }
    gc.collect()

    logging.info("Colorado: writing out")
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
        s3_bucket=self.s3_bucket,
    )
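# The np.where call above is a column coalesce. A minimal sketch with
# made-up addresses:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "MAILING_ADDRESS_1": ["12 Elm St", None],
        "MAIL_ADDR1": ["ignored", "99 Oak Ave"],
    }
)
df["MAILING_ADDRESS_1"] = np.where(
    df["MAILING_ADDRESS_1"].isnull(),  # where the new-style field is empty
    df["MAIL_ADDR1"],                  # fall back to the old-style field
    df["MAILING_ADDRESS_1"],           # otherwise keep what is there
)
# -> ["12 Elm St", "99 Oak Ave"]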
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    def is_first_file(fname):
        if "CD1" in fname:
            if ("Part1" in fname) or ("Part 1" in fname):
                return True
        return False

    new_files = self.unpack_files(
        file_obj=self.main_file, compression="unzip"
    )
    logging.info("IOWA: reading in voter file")

    first_file = [f for f in new_files if is_first_file(f["name"])][0]
    remaining_files = [
        f for f in new_files if not is_first_file(f["name"])
    ]
    if not self.ignore_checks:
        # add 1 for the first file
        valid_files = len(remaining_files) + 1
        self.file_check(valid_files)

    buffer_cols = [
        "buffer0", "buffer1", "buffer2", "buffer3", "buffer4",
        "buffer5", "buffer6", "buffer7", "buffer8", "buffer9",
    ]

    # Read the headers in from the first file
    headers = pd.read_csv(first_file["obj"], nrows=1).columns
    # Gather the columns for renaming in order to fit the original schema
    # in the database, then rename so that the columns in the header
    # match what is expected
    column_rename_dict = self.config["rename_columns"]
    normalized_headers = [
        x if x not in column_rename_dict else column_rename_dict[x]
        for x in headers
    ]
    normalized_headers = [x.replace(" ", "_") for x in normalized_headers]

    columns_to_check = [
        x for x in normalized_headers
        if x not in self.config["election_columns"]
    ]
    self.column_check(columns_to_check)

    # Add the buffer columns back in for lines that contain extra commas
    headers_with_buffers = normalized_headers + buffer_cols

    # Begin reading the file with the correct headers
    df_voters = self.read_csv_count_error_lines(
        first_file["obj"],
        skiprows=1,
        header=None,
        names=headers_with_buffers,
        error_bad_lines=False,
    )

    for i in remaining_files:
        skiprows = 1 if "Part1" in i["name"] else 0
        new_df = self.read_csv_count_error_lines(
            i["obj"],
            header=None,
            skiprows=skiprows,
            names=headers_with_buffers,
            error_bad_lines=False,
        )
        df_voters = pd.concat([df_voters, new_df], axis=0)

    key_delim = "_"
    df_voters["all_history"] = ""
    df_voters = df_voters[df_voters.COUNTY != "COUNTY"]

    # TODO: instead of iterating over all of the columns for each row,
    # handle all of this beforehand; likewise, compute the unique values
    # after this loop, not before
    df_voters.drop(columns=buffer_cols, inplace=True)

    for c in self.config["election_dates"]:
        null_rows = df_voters[c].isnull()
        df_voters.loc[null_rows, c] = ""

        # each key contains info from the columns
        prefix = c.split("_")[0] + key_delim

        # and the corresponding votervotemethod column
        vote_type_col = c.replace("ELECTION_DATE", "VOTERVOTEMETHOD")
        null_rows = df_voters[vote_type_col].isnull()
        df_voters.loc[null_rows, vote_type_col] = ""

        # add election type and date
        df_voters[c] = prefix + df_voters[c].str.strip()
        # add voting method
        df_voters[c] += key_delim + df_voters[vote_type_col].str.strip()

        # the code below will format each key as
        # <election_type>_<date>_<voting_method>_<political_party>_
        # <political_org>
        if "PRIMARY" in prefix:
            # primaries need additional columns
            org_col = c.replace(
                "PRIMARY_ELECTION_DATE", "POLITICAL_ORGANIZATION"
            )
            party_col = c.replace(
                "PRIMARY_ELECTION_DATE", "POLITICAL_PARTY"
            )
            df_voters.loc[df_voters[org_col].isnull(), org_col] = ""
            df_voters.loc[df_voters[party_col].isnull(), party_col] = ""
            party_info = (
                df_voters[party_col].str.strip()
                + key_delim
                + df_voters[org_col].str.replace(" ", "")
            )
            df_voters[c] += key_delim + party_info
        else:
            # add 'blank' values for the primary slots
            df_voters[c] += key_delim + key_delim

        df_voters[c] = df_voters[c].str.replace(prefix + key_delim * 3, "")
        df_voters[c] = df_voters[c].str.replace('"', "")
        df_voters[c] = df_voters[c].str.replace("'", "")
        df_voters.all_history += " " + df_voters[c]

    # make into an array (null values are '' so they are ignored)
    df_voters.all_history = df_voters.all_history.str.split()
    elections, counts = np.unique(
        df_voters[self.config["election_dates"]], return_counts=True
    )
    # we want reverse order (lower indices are higher frequency)
    count_order = counts.argsort()[::-1]
    elections = elections[count_order]
    counts = counts[count_order]

    # create meta
    sorted_codes_dict = {
        j: {"index": i, "count": int(counts[i]), "date": date_from_str(j)}
        for i, j in enumerate(elections)
    }

    default_item = {"index": len(elections)}

    def ins_code_bin(a):
        return [sorted_codes_dict.get(k, default_item)["index"] for k in a]

    # Since we have created our own systematized labels for each election,
    # it makes sense to keep them in addition to the sparse history
    df_voters["sparse_history"] = df_voters.all_history.apply(ins_code_bin)

    self.meta = {
        "message": "iowa_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(elections.tolist()),
    }

    for c in df_voters.columns:
        df_voters.loc[df_voters[c].isnull(), c] = ""
    for c in df_voters.columns:
        df_voters[c] = (
            df_voters[c]
            .astype(str)
            .str.encode("utf-8", errors="ignore")
            .str.decode("utf-8")
        )

    df_voters = self.config.coerce_dates(df_voters)
    df_voters = self.config.coerce_numeric(
        df_voters,
        extra_cols=[
            "COMMUNITY_COLLEGE",
            "COMMUNITY_COLLEGE_DIRECTOR",
            "LOSST_CONTIGUOUS_CITIES",
            "PRECINCT",
            "SANITARY",
            "SCHOOL_DIRECTOR",
            "UNIT_NUM",
        ],
    )

    # force reg num to be integer
    df_voters["REGN_NUM"] = pd.to_numeric(
        df_voters["REGN_NUM"], errors="coerce"
    ).fillna(0)
    df_voters["REGN_NUM"] = df_voters["REGN_NUM"].astype(int)

    # Drop the election columns because they are no longer needed
    df_voters.drop(columns=self.config["election_columns"], inplace=True)

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(df_voters.to_csv(encoding="utf-8", index=False)),
        s3_bucket=self.s3_bucket,
    )
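# Hypothetical examples of the composite keys built in the loop above,
# following <election_type>_<date>_<voting_method>_<party>_<org>:
#
#     "GENERAL_06/05/2018_P__"       (non-primary: party/org slots blank)
#     "PRIMARY_06/05/2018_P_DEM_"    (primary with a party, no organization)
#
# Dates and codes here are illustrative, not taken from a real file.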
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    new_files = self.unpack_files(self.main_file)

    precincts_file = ([
        x for x in new_files if "precincts" in x["name"].lower()
    ] + [None])[0]
    if precincts_file is None:
        raise ValueError("Missing Precincts File")

    voter_files = list(
        filter(
            lambda v: re.search(r"cty[0-9]+_vr.csv", v["name"].lower()),
            new_files,
        )
    )
    self.file_check(len(voter_files) + 1)
    hist_files = list(
        filter(
            lambda v: re.search(r"cty[0-9]+_vh.csv", v["name"].lower()),
            new_files,
        )
    )

    vdf = pd.DataFrame()
    hdf = pd.DataFrame()
    dtypes = self.config["dtypes"]
    cty_map = dict(
        [(value, key)
         for key, value in self.config["county_codes"].items()]
    )

    # Returns the string county name for the county code contained in
    # the first two characters of the precinct string
    def county_map(pct):
        def mapping(prec):
            county = cty_map[prec[:2]]
            return county

        return pd.Series(map(mapping, pct.tolist()))

    for file in voter_files:
        if "vr.csv" in file["name"].lower():
            temp_vdf = pd.read_csv(
                file["obj"], encoding="latin", dtype=dtypes
            )
            vdf = pd.concat([vdf, temp_vdf], ignore_index=True)
    vdf.drop_duplicates(inplace=True)

    # Read and merge the precincts file into the main df
    precinct_dtypes = {
        "PrecinctCode": "string",
        "CongressionalDistrict": "int64",
        "StateSenateDistrict": "int64",
        "StateHouseDistrict": "int64",
        "CountyCommissioner": "int64",
        "PollSite": "string",
    }
    precincts = pd.read_csv(
        precincts_file["obj"], encoding="latin", dtype=precinct_dtypes
    )
    precincts.rename(columns={"PrecinctCode": "Precinct"}, inplace=True)
    if precincts.empty:
        raise ValueError("Missing Precincts file")
    vdf = vdf.merge(precincts, how="left", on="Precinct")

    # Add the county column
    vdf["County"] = county_map(vdf["Precinct"])

    # At one point OK added some columns; this adds them to older files
    # for backwards compatibility
    vdf = self.reconcile_columns(vdf, self.config["columns"])

    for file in hist_files:
        temp_hdf = pd.read_csv(file["obj"], dtype={"VoterID": "string"})
        hdf = pd.concat([hdf, temp_hdf], ignore_index=True)

    valid_elections, counts = np.unique(
        hdf["ElectionDate"], return_counts=True
    )
    count_order = counts.argsort()[::-1]
    valid_elections = valid_elections[count_order]
    counts = counts[count_order]
    sorted_codes = valid_elections.tolist()
    sorted_codes_dict = {
        k: {"index": i, "count": int(counts[i]), "date": date_from_str(k)}
        for i, k in enumerate(sorted_codes)
    }
    hdf["array_position"] = hdf["ElectionDate"].map(
        lambda x: int(sorted_codes_dict[x]["index"])
    )

    # The hist columns in the vdf are unnecessary because we get a
    # separate hist file that is more complete
    hist_columns = [
        col for col in vdf.columns
        if "voterhist" in col.lower() or "histmethod" in col.lower()
    ]

    vdf = self.config.coerce_numeric(vdf)
    vdf = self.config.coerce_strings(vdf)
    vdf = self.config.coerce_dates(vdf)
    vdf.drop(columns=hist_columns, inplace=True)

    vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
    voter_groups = hdf.groupby(self.config["voter_id"])
    vdf["all_history"] = voter_groups["ElectionDate"].apply(list)
    vdf["sparse_history"] = voter_groups["array_position"].apply(list)
    vdf["votetype_history"] = voter_groups["VotingMethod"].apply(list)

    self.meta = {
        "message": "oklahoma_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(vdf.to_csv(encoding="utf-8", index=False)),
        s3_bucket=self.s3_bucket,
    )
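# Sketch of county_map with a hypothetical county_codes config of
# {"Adair": "01"}: cty_map inverts it to {"01": "Adair"}, so a precinct
# string like "010012" maps to "Adair" via its first two characters.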
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()

    new_files = self.unpack_files(file_obj=self.main_file)  # array of dicts
    del self.main_file, self.temp_files
    gc.collect()

    if not self.ignore_checks:
        self.file_check(len(new_files))
    for i in new_files:
        if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
            vote_hist_file = i
        elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
            voter_file = i

    voter_df = self.read_csv_count_error_lines(
        voter_file["obj"],
        sep="\t",
        quotechar='"',
        encoding="latin-1",
        error_bad_lines=False,
    )
    del voter_file
    gc.collect()

    vote_hist = self.read_csv_count_error_lines(
        vote_hist_file["obj"],
        sep="\t",
        quotechar='"',
        error_bad_lines=False,
    )
    del vote_hist_file, new_files
    gc.collect()

    try:
        voter_df.columns = self.config["ordered_columns"]
    except ValueError:
        logging.info(
            "Incorrect number of columns found for the voter file in "
            "North Carolina"
        )
        raise MissingNumColumnsError(
            "{} state is missing columns".format(self.state),
            self.state,
            len(self.config["ordered_columns"]),
            len(voter_df.columns),
        )
    try:
        vote_hist.columns = self.config["hist_columns"]
    except ValueError:
        logging.info(
            "Incorrect number of columns found for the history file in "
            "North Carolina"
        )
        raise

    valid_elections, counts = np.unique(
        vote_hist["election_desc"], return_counts=True
    )
    count_order = counts.argsort()[::-1]
    valid_elections = valid_elections[count_order]
    counts = counts[count_order]

    sorted_codes = valid_elections.tolist()
    sorted_codes_dict = {
        k: {"index": i, "count": int(counts[i]), "date": date_from_str(k)}
        for i, k in enumerate(sorted_codes)
    }
    vote_hist["array_position"] = vote_hist["election_desc"].map(
        lambda x: int(sorted_codes_dict[x]["index"])
    )
    del valid_elections, counts, count_order
    gc.collect()

    voter_groups = vote_hist.groupby(self.config["voter_id"])
    all_history = voter_groups["array_position"].apply(list)
    vote_type = voter_groups["voting_method"].apply(list)

    voter_df = voter_df.set_index(self.config["voter_id"])
    voter_df["all_history"] = all_history
    voter_df["vote_type"] = vote_type
    del voter_groups, vote_hist, all_history, vote_type
    gc.collect()

    voter_df = self.config.coerce_strings(voter_df)
    voter_df = self.config.coerce_dates(voter_df)
    voter_df = self.config.coerce_numeric(
        voter_df,
        extra_cols=[
            "county_commiss_abbrv",
            "fire_dist_abbrv",
            "full_phone_number",
            "judic_dist_abbrv",
            "munic_dist_abbrv",
            "municipality_abbrv",
            "precinct_abbrv",
            "precinct_desc",
            "school_dist_abbrv",
            "super_court_abbrv",
            "township_abbrv",
            "township_desc",
            "vtd_abbrv",
            "vtd_desc",
            "ward_abbrv",
        ],
    )

    self.meta = {
        "message": "north_carolina_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_codes_dict),
        "array_decoding": json.dumps(sorted_codes),
    }
    self.is_compressed = False

    csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
    del voter_df
    gc.collect()

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()