Exemplo n.º 1
0
def _stream_download(url, target_path, verify=True):
    """Stream *url* to *target_path* in small chunks to bound memory use.

    The file handle is managed with a context manager so it is closed
    even if the download raises partway through.
    """
    response = requests.get(url, stream=True, verify=verify)
    with open(target_path, "wb") as handle:
        for chunk in response.iter_content(chunk_size=512):
            if chunk:
                handle.write(chunk)


def state_download(state, s3_bucket):
    """Download a state's raw voter data, bundle it into one zip archive,
    and upload the archive to S3 via ``Loader.s3_dump``.

    Args:
        state: state key (currently "north_carolina" or "ohio"); selects
            the config file and the per-state download behavior.
        s3_bucket: name of the S3 bucket receiving the raw archive.
    """
    config_file = Config.config_file_from_state(state=state)
    configs = Config(file_name=config_file)

    if state == "north_carolina":
        today = nc_date_grab()
        list_files = configs['data_chunk_links']
        zipped_files = []
        for i, url in enumerate(list_files):
            target_path = "/tmp/" + state + str(i) + ".zip"
            zipped_files.append(target_path)
            _stream_download(url, target_path)
        # bundle the per-chunk downloads into one dated archive
        file_to_zip = today + ".zip"
        with zipfile.ZipFile(file_to_zip, 'w') as myzip:
            for f in zipped_files:
                myzip.write(f)
        file_to_zip = FileItem(
            "NC file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket)
        loader = Loader(config_file=config_file, force_date=today,
                        s3_bucket=s3_bucket)
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)

    elif state == "ohio":
        today = str(ohio_get_last_updated().isoformat())[0:10]
        list_files = configs['data_chunk_links']
        file_names = configs['data_file_names']
        zipped_files = []
        for i, url in enumerate(list_files):
            logging.info("downloading {} file".format(url))
            target_path = "/tmp/" + state + "_" + file_names[i] + ".txt.gz"
            zipped_files.append(target_path)
            # NOTE(review): TLS verification is disabled for the Ohio
            # endpoint; confirm this is still required before keeping it.
            _stream_download(url, target_path, verify=False)
            logging.info("downloaded {} file".format(url))
        file_to_zip = today + ".zip"
        logging.info("Zipping files")
        with zipfile.ZipFile(file_to_zip, 'w') as myzip:
            for f in zipped_files:
                myzip.write(f)
        logging.info("Uploading")
        file_to_zip = FileItem(
            "OH file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket)
        loader = Loader(config_file=config_file, force_date=today,
                        s3_bucket=s3_bucket)
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)
Exemplo n.º 2
0
    def execute(self):
        """Preprocess the Oregon voter file.

        Downloads the raw archive from S3 when configured, unpacks it,
        coerces column types, normalizes the voter id and the UNLISTED
        flag, and stages the processed CSV as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        # drop any readme files shipped alongside the voter file
        new_files = [
            f for f in self.unpack_files(self.main_file, compression="unzip")
            if "readme" not in f["name"].lower()
        ]

        # readmes were already filtered out above, so the first remaining
        # entry is the voter file (IndexError means none was found)
        voter_file = new_files[0]
        # don't have access to them yet
        # hist_files = ...

        # --- handling voter file --- #

        df_voter = pd.read_csv(voter_file["obj"], sep="\t",
                               dtype=str).dropna(how="all", axis=1)

        df_voter = self.config.coerce_dates(df_voter)
        df_voter = self.config.coerce_strings(df_voter, exclude=["STATE"])
        df_voter = self.config.coerce_numeric(df_voter)

        # voter ids are zero-padded to 9 digits and kept as strings
        df_voter.loc[:, self.config["voter_id"]] = (
            df_voter.loc[:,
                         self.config["voter_id"]].str.zfill(9).astype("str"))
        # map the textual yes/no UNLISTED flag onto a proper boolean
        df_voter.loc[:, "UNLISTED"] = df_voter.loc[:, "UNLISTED"].map({
            "yes": True,
            "no": False,
        })
        # when vote history is received

        self.meta = {
            "message": "oregon_{}".format(datetime.now().isoformat()),
            # vote history not available yet
            # 'array_encoding': json.dumps(),
            # 'array_decoding': json.dumps()
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 3
0
    def execute(self):
        """Preprocess the Iowa voter file.

        Reads the multi-part voter file, rebuilds per-voter history keys
        of the form
        ``<election_type>_<date>_<voting_method>_<party>_<org>``, encodes
        the history sparsely against the set of observed elections,
        coerces column types, and stages the processed CSV as
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def is_first_file(fname):
            # the header-bearing chunk is congressional district 1, part 1
            if "CD1" in fname:
                if ("Part1" in fname) or ("Part 1" in fname):
                    return True
            return False

        new_files = self.unpack_files(
            file_obj=self.main_file, compression="unzip"
        )
        logging.info("IOWA: reading in voter file")

        first_file = [f for f in new_files if is_first_file(f["name"])][0]
        remaining_files = [
            f for f in new_files if not is_first_file(f["name"])
        ]
        if not self.ignore_checks:
            # add 1 for the first file
            valid_files = len(remaining_files) + 1
            self.file_check(valid_files)

        # extra columns to absorb stray commas on malformed lines
        buffer_cols = ["buffer{}".format(i) for i in range(10)]

        # Reads the headers in on the first file given
        headers = pd.read_csv(first_file["obj"], nrows=1).columns

        # Gather the columns for renaming in order to fit the original
        # schema in the database and then rename so that the columns in
        # the header will fit what is expected
        column_rename_dict = self.config["rename_columns"]
        normalized_headers = [
            x if x not in column_rename_dict else column_rename_dict[x]
            for x in headers
        ]
        normalized_headers = [x.replace(" ", "_") for x in normalized_headers]

        columns_to_check = [
            x
            for x in normalized_headers
            if x not in self.config["election_columns"]
        ]
        self.column_check(columns_to_check)

        # Add the buffer columns back in for lines that contain extra commas
        headers_with_buffers = normalized_headers + buffer_cols

        # Begin reading the file with the correct headers
        df_voters = self.read_csv_count_error_lines(
            first_file["obj"],
            skiprows=1,
            header=None,
            names=headers_with_buffers,
            error_bad_lines=False,
        )

        for i in remaining_files:
            # only the "Part1" chunks repeat the header row
            skiprows = 1 if "Part1" in i["name"] else 0
            new_df = self.read_csv_count_error_lines(
                i["obj"],
                header=None,
                skiprows=skiprows,
                names=headers_with_buffers,
                error_bad_lines=False,
            )
            df_voters = pd.concat([df_voters, new_df], axis=0)

        key_delim = "_"
        df_voters["all_history"] = ""
        # drop stray header rows that slipped in as data
        df_voters = df_voters[df_voters.COUNTY != "COUNTY"]

        # instead of iterating over all of the columns for each row, we should
        # handle all this beforehand.
        # also we should not compute the unique values until after, not before
        df_voters.drop(columns=buffer_cols, inplace=True)

        for c in self.config["election_dates"]:
            # fill nulls via .loc with a row mask; chained indexing
            # (df[c][mask] = ...) may write to a copy and be lost
            null_rows = df_voters[c].isnull()
            df_voters.loc[null_rows, c] = ""

            # each key contains info from the columns
            prefix = c.split("_")[0] + key_delim

            # and the corresponding votervotemethod column
            vote_type_col = c.replace("ELECTION_DATE", "VOTERVOTEMETHOD")
            null_rows = df_voters[vote_type_col].isnull()
            df_voters.loc[null_rows, vote_type_col] = ""
            # add election type and date
            df_voters[c] = prefix + df_voters[c].str.strip()
            # add voting method
            df_voters[c] += key_delim + df_voters[vote_type_col].str.strip()

            # the code below will format each key as
            # <election_type>_<date>_<voting_method>_<political_party>_
            # <political_org>
            if "PRIMARY" in prefix:

                # so far so good but we need more columns in the event of a
                # primary
                org_col = c.replace(
                    "PRIMARY_ELECTION_DATE", "POLITICAL_ORGANIZATION"
                )
                party_col = c.replace(
                    "PRIMARY_ELECTION_DATE", "POLITICAL_PARTY"
                )
                df_voters.loc[df_voters[org_col].isnull(), org_col] = ""
                df_voters.loc[df_voters[party_col].isnull(), party_col] = ""
                party_info = (
                    df_voters[party_col].str.strip()
                    + key_delim
                    + df_voters[org_col].str.replace(" ", "")
                )
                df_voters[c] += key_delim + party_info
            else:
                # add 'blank' values for the primary slots
                df_voters[c] += key_delim + key_delim

            # strip keys for elections the voter did not participate in;
            # these are literal replacements, hence regex=False
            df_voters[c] = df_voters[c].str.replace(
                prefix + key_delim * 3, "", regex=False
            )
            df_voters[c] = df_voters[c].str.replace('"', "", regex=False)
            df_voters[c] = df_voters[c].str.replace("'", "", regex=False)
            df_voters.all_history += " " + df_voters[c]

        # make into an array (null values are '' so they are ignored)
        df_voters.all_history = df_voters.all_history.str.split()
        elections, counts = np.unique(
            df_voters[self.config["election_dates"]], return_counts=True
        )
        # we want reverse order (lower indices are higher frequency)
        count_order = counts.argsort()[::-1]
        elections = elections[count_order]
        counts = counts[count_order]

        # create meta
        sorted_codes_dict = {
            j: {"index": i, "count": int(counts[i]), "date": date_from_str(j)}
            for i, j in enumerate(elections)
        }

        # unseen keys (e.g. '') fall through to a sentinel index
        default_item = {"index": len(elections)}

        def ins_code_bin(a):
            return [sorted_codes_dict.get(k, default_item)["index"] for k in a]

        # In an instance like this, where we've created our own systematized
        # labels for each election I think it makes sense to also keep them
        # in addition to the sparse history
        df_voters["sparse_history"] = df_voters.all_history.apply(ins_code_bin)

        self.meta = {
            "message": "iowa_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(elections.tolist()),
        }
        # blank out remaining nulls, again via .loc to avoid chained
        # assignment onto a potential copy
        for c in df_voters.columns:
            df_voters.loc[df_voters[c].isnull(), c] = ""

        # strip characters that cannot round-trip through utf-8
        for c in df_voters.columns:
            df_voters[c] = (
                df_voters[c]
                .astype(str)
                .str.encode("utf-8", errors="ignore")
                .str.decode("utf-8")
            )

        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "COMMUNITY_COLLEGE",
                "COMMUNITY_COLLEGE_DIRECTOR",
                "LOSST_CONTIGUOUS_CITIES",
                "PRECINCT",
                "SANITARY",
                "SCHOOL_DIRECTOR",
                "UNIT_NUM",
            ],
        )
        # force reg num to be integer
        df_voters["REGN_NUM"] = pd.to_numeric(
            df_voters["REGN_NUM"], errors="coerce"
        ).fillna(0)
        df_voters["REGN_NUM"] = df_voters["REGN_NUM"].astype(int)

        # Drop the election columns because they are no longer needed
        df_voters.drop(columns=self.config["election_columns"], inplace=True)

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voters.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 4
0
    def execute(self):
        """Preprocess the Maryland voter file.

        Builds per-voter vote-history arrays from the history file, joins
        them onto the voter file by voter id, and stages the processed
        CSV as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")
        hist_file = [
            n for n in new_files if "history.txt" in n["name"].lower()
        ][0]
        voter_file = [n for n in new_files
                      if "list.txt" in n["name"].lower()][0]
        # separate file with extra data, absentee votes still counted in voter_file
        abs_file = [
            n for n in new_files if "absentee.txt" in n["name"].lower()
        ][0]

        # --- handling voter history --- #
        df_hist = pd.read_csv(hist_file["obj"], sep="\t", dtype=str)

        election_dates = pd.to_datetime(df_hist.loc[:, "Election Date"])
        # raw string so \D is a regex class rather than a (deprecated)
        # string escape sequence
        election_names = (df_hist.loc[:, "Election Description"].str.extract(
            r"(\D+)", expand=False).str.strip().str.replace(" ", "_"))

        # history keys look like YYYY_MM_DD_<election_name>
        df_hist.loc[:,
                    "all_history"] = (election_dates.dt.strftime("%Y_%m_%d_") +
                                      election_names.str.lower())
        df_hist.loc[:, "earlyvote_history"] = "N"
        df_hist.loc[~df_hist.loc[:, "Early Voting Location"].isna(),
                    "earlyvote_history", ] = "Y"
        df_hist.loc[:, "votetype_history"] = (
            df_hist.loc[:, "Voting Method"].str.replace(" ", "_").str.lower())
        df_hist.loc[:,
                    "party_history"] = df_hist.loc[:,
                                                   "Political Party"].str.lower(
                                                   )
        df_hist.loc[:, "jurisd_history"] = (
            df_hist.loc[:, "Jurisdiction Code"].astype(str).str.strip())
        df_hist.loc[:, "Election Date"] = pd.to_datetime(
            df_hist.loc[:, "Election Date"])

        # one row per election: (history key, date, voter count)
        elections = (df_hist.groupby(
            ["all_history",
             "Election Date"])["Voter ID"].count().reset_index().values)
        sorted_elections_dict = {
            k[0]: {
                "index": i,
                "count": int(k[2]),
                "date": k[1].strftime("%Y-%m-%d"),
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        df_hist.loc[:, "sparse_history"] = df_hist.all_history.apply(
            lambda x: sorted_elections_dict[x]["index"])
        # collapse each voter's rows into lists, one element per election
        history = list(df_hist.loc[:, "all_history":].columns)
        df_hist = df_hist.loc[:, ["Voter ID"] + history].rename(
            {"Voter ID": self.config["voter_id"]}, axis=1)
        group = df_hist.groupby(self.config["voter_id"])
        df_hist = pd.concat(
            [group[c].apply(list) for c in df_hist.columns[1:]], axis=1)

        # --- handling voter file --- #
        df_voter = (pd.read_csv(
            voter_file["obj"], sep="\t",
            dtype=str).iloc[:, :len(self.config["column_names"])].set_index(
                self.config["voter_id"]))
        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_numeric(df_voter)
        df_voter = self.config.coerce_dates(df_voter)

        df_voter = df_voter.join(df_hist)

        self.meta = {
            "message": "maryland_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 5
0
    def execute(self):
        """Preprocess the Florida voter file.

        Concatenates the per-county voter and history files, builds a
        sparse per-voter history encoding, coerces column types, and
        stages the processed CSV as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        logging.info("preprocessing florida")
        # new_files is list of dicts, i.e. [{"name":.. , "obj": <fileobj>}, ..]
        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)
        # memory is tight for this state, so aggressively free what we can
        del self.main_file, self.temp_files
        gc.collect()

        vote_history_files = []
        voter_files = []
        for i in new_files:
            if "_H_" in i["name"]:
                vote_history_files.append(i)
            elif ".txt" in i["name"]:
                voter_files.append(i)

        if not self.ignore_checks:
            self.file_check(len(voter_files))
        concat_voter_file = concat_and_delete(voter_files)
        concat_history_file = concat_and_delete(vote_history_files)
        del new_files, vote_history_files, voter_files
        gc.collect()

        logging.info("FLORIDA: loading voter history file")
        df_hist = pd.read_fwf(concat_history_file, header=None)
        try:
            df_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info("Incorrect history columns found in Florida")
            raise MissingNumColumnsError(
                "{} state history is missing columns".format(self.state),
                self.state,
                len(self.config["hist_columns"]),
                len(df_hist.columns),
            )
        del concat_history_file
        gc.collect()

        # keep rows with a plausible date string; .str.len() also handles
        # NaN rows (dropped) where len() would raise a TypeError
        df_hist = df_hist[df_hist["date"].str.len() > 5]
        df_hist["election_name"] = (df_hist["date"] + "_" +
                                    df_hist["election_type"])
        valid_elections, counts = np.unique(df_hist["election_name"],
                                            return_counts=True)
        # sort elections newest-first by the MM/DD/YYYY prefix of the name
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][:-4], "%m/%d/%Y"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        df_hist["array_position"] = df_hist["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, date_order
        gc.collect()

        logging.info("FLORIDA: history apply")
        voter_groups = df_hist.groupby("VoterID")
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["vote_type"].apply(list)
        del voter_groups, df_hist
        gc.collect()

        logging.info("FLORIDA: loading main voter file")
        df_voters = self.read_csv_count_error_lines(concat_voter_file,
                                                    header=None,
                                                    sep="\t",
                                                    error_bad_lines=False)
        del concat_voter_file
        gc.collect()

        try:
            df_voters.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for Florida")
            raise MissingNumColumnsError(
                "{} state is missing voters columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(df_voters.columns),
            )
        df_voters = df_voters.set_index(self.config["voter_id"])

        # join the per-voter history lists on the voter-id index
        df_voters["all_history"] = all_history
        df_voters["vote_type"] = vote_type
        del all_history, vote_type
        gc.collect()

        df_voters = self.config.coerce_strings(df_voters)
        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "Precinct",
                "Precinct_Split",
                "Daytime_Phone_Number",
                "Daytime_Area_Code",
                "Daytime_Phone_Extension",
                "Mailing_Zipcode",
                "Residence_Zipcode",
                "Mailing_Address_Line_1",
                "Mailing_Address_Line_2",
                "Mailing_Address_Line_3",
                "Residence_Address_Line_1",
                "Residence_Address_Line_2",
            ],
        )

        self.meta = {
            "message": "florida_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        csv_obj = df_voters.to_csv(encoding="utf-8")
        del df_voters
        gc.collect()

        logging.info("FLORIDA: writing out")
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 6
0
    def execute(self):
        """Preprocess the New York voter file.

        Reads the statewide file, normalizes the Dec-2021 column layout
        back to the historical schema, builds the sparse vote-history
        encoding, coerces types, and stages the processed CSV as
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file, compression="infer"
        )

        if not self.ignore_checks:
            self.file_check(len(new_files))

        # no longer include pdfs in file list anyway, can just assign main file
        self.main_file = new_files[0]
        gc.collect()
        main_df = self.read_csv_count_error_lines(
            self.main_file["obj"],
            header=None,
            encoding="latin-1",
            error_bad_lines=False,
        )

        # In Dec 2021, NY added 2 columns (RAPARTMENTTYPE, RADDRNONSTD),
        # and rearranged the other address columns slightly.
        # Since these columns are not very useful /
        # affect only a small set of voters (140k),
        # We are going to drop them for now to allow for
        # easy compatibility with past data.
        if len(main_df.columns) > len(self.config["ordered_columns"]):
            main_df.drop(main_df.columns[11], axis=1, inplace=True)
            main_df.drop(main_df.columns[9], axis=1, inplace=True)

            # Rearrange columns slightly to fit pre-Dec 2021 data
            ordered_cols = main_df.columns.to_list()
            ordered_cols = (ordered_cols[:6] + [ordered_cols[9]] +
                            ordered_cols[6:9] + ordered_cols[10:])
            main_df = main_df[ordered_cols]

            # They also changed some of the status codes to abbreviations
            # So this maps them back to what they used to be
            main_df[41] = main_df[41].map(
                self.config["status_codes_remap"]
            )

        # apply column names
        main_df.columns = self.config["ordered_columns"]

        logging.info(
            "dataframe memory usage: {}".format(
                main_df.memory_usage(deep=True).sum()
            )
        )
        del self.main_file, self.temp_files, new_files
        gc.collect()

        # fill missing histories via .loc; chained indexing
        # (main_df.voterhistory[mask] = ...) may write to a copy
        null_hists = main_df.voterhistory.isna()
        main_df.loc[null_hists, "voterhistory"] = NULL_CHAR
        # literal replacements, so regex=False ("[" is a regex metachar)
        all_codes = (
            main_df.voterhistory.str.replace(" ", "_", regex=False)
            .str.replace("[", "", regex=False)
            .str.replace("]", "", regex=False)
        )
        all_codes = all_codes.str.cat(sep=";")
        all_codes = np.array(all_codes.split(";"))
        logging.info("Making all_history")
        main_df["all_history"] = strcol_to_array(
            main_df.voterhistory, delim=";"
        )
        unique_codes, counts = np.unique(all_codes, return_counts=True)
        del all_codes, null_hists
        gc.collect()

        count_order = counts.argsort()
        unique_codes = unique_codes[count_order]
        counts = counts[count_order]
        sorted_codes = unique_codes.tolist()
        del unique_codes, count_order
        gc.collect()

        sorted_codes_dict = {
            k: {"index": i, "count": int(counts[i])}
            for i, k in enumerate(sorted_codes)
        }
        del counts
        gc.collect()

        def insert_code_bin(arr):
            return [sorted_codes_dict[k]["index"] for k in arr]

        # in this case we save ny as sparse array since so many elections are
        # stored
        logging.info("Mapping history codes")
        main_df.all_history = main_df.all_history.map(insert_code_bin)

        main_df = self.config.coerce_dates(main_df)
        main_df = self.config.coerce_strings(main_df)
        main_df = self.config.coerce_numeric(
            main_df,
            extra_cols=[
                "raddnumber",
                "rhalfcode",
                "rapartment",
                "rzip5",
                "rzip4",
                "mailadd4",
                "ward",
                "countyvrnumber",
                "lastvoteddate",
                "prevyearvoted",
                "prevcounty",
            ],
        )
        self.meta = {
            "message": "new_york_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        del sorted_codes, sorted_codes_dict
        gc.collect()

        csv_obj = main_df.to_csv(encoding="utf-8", index=False)
        del main_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 7
0
    def execute(self):
        """Preprocess the Georgia voter file.

        Loads the daily voter base and the fixed-width vote-history
        files, parses each packed history record into its component
        fields, builds a sparse per-voter history encoding, coerces
        column types, and stages the processed CSV as
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        logging.info("GEORGIA: loading voter and voter history file")
        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)
        # free the raw archive as early as possible to limit peak memory
        del self.main_file, self.temp_files
        gc.collect()

        voter_files = []
        vh_files = []
        for i in new_files:
            if "Georgia_Daily_VoterBase.txt".lower() in i["name"].lower():
                logging.info("Detected voter file: " + i["name"])
                voter_files.append(i)
            elif "txt" in i["name"].lower():
                vh_files.append(i)
        logging.info("Detected {} history files".format(len(vh_files)))
        del new_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(voter_files))

        # quoting=3 is csv.QUOTE_NONE: the pipe-delimited file is unquoted
        df_voters = self.read_csv_count_error_lines(
            voter_files[0]["obj"],
            sep="|",
            quotechar='"',
            quoting=3,
            error_bad_lines=False,
        )
        del voter_files
        gc.collect()

        try:
            df_voters.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for Georgia")
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(df_voters.columns),
            )
        # registration numbers are fixed-width: zero-pad to 8 digits
        df_voters["Registration_Number"] = (
            df_voters["Registration_Number"].astype(str).str.zfill(8))

        concat_history_file = concat_and_delete(vh_files)
        del vh_files
        gc.collect()

        logging.info("Performing GA history manipulation")

        # each history line is two packed fields separated by two spaces;
        # the multi-character sep makes pandas use its python engine
        history = self.read_csv_count_error_lines(
            concat_history_file,
            sep="  ",
            names=["Concat_str", "Other"],
            error_bad_lines=False,
        )
        del concat_history_file
        gc.collect()

        # slice the packed record into its fixed-width components
        history["County_Number"] = history["Concat_str"].str[0:3]
        history["Registration_Number"] = history["Concat_str"].str[3:11]
        history["Election_Date"] = history["Concat_str"].str[11:19]
        history["Election_Type"] = history["Concat_str"].str[19:22]
        history["Party"] = history["Concat_str"].str[22:24]
        history["Absentee"] = history["Other"].str[0]
        history["Provisional"] = history["Other"].str[1]
        history["Supplimental"] = history["Other"].str[2]
        # readable labels for the 3-digit election-type codes
        type_dict = {
            "001": "GEN_PRIMARY",
            "002": "GEN_PRIMARY_RUNOFF",
            "003": "GEN",
            "004": "GEN_ELECT_RUNOFF",
            "005": "SPECIAL_ELECT",
            "006": "SPECIAL_RUNOFF",
            "007": "NON-PARTISAN",
            "008": "SPECIAL_NON-PARTISAN",
            "009": "RECALL",
            "010": "PPP",
        }
        history = history.replace({"Election_Type": type_dict})
        # combined key:
        # <date>_<type>_<party>_<absentee>_<provisional>_<supplimental>
        history["Combo_history"] = history["Election_Date"].str.cat(
            others=history[[
                "Election_Type",
                "Party",
                "Absentee",
                "Provisional",
                "Supplimental",
            ]],
            sep="_",
        )
        # keep only the parsed columns; drop the raw packed fields
        history = history.filter(items=[
            "County_Number",
            "Registration_Number",
            "Election_Date",
            "Election_Type",
            "Party",
            "Absentee",
            "Provisional",
            "Supplimental",
            "Combo_history",
        ])
        history = history.dropna()

        logging.info("Creating GA sparse history")

        valid_elections, counts = np.unique(history["Combo_history"],
                                            return_counts=True)

        # sort elections newest-first by the YYYYMMDD prefix of the key
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][0:8], "%Y%m%d"),
                reverse=True,
            )
        ]

        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": datetime.strptime(k[0:8], "%Y%m%d"),
            }
            for i, k in enumerate(sorted_codes)
        }
        history["array_position"] = history["Combo_history"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        # collapse per-voter history rows into aligned lists
        voter_groups = history.groupby("Registration_Number")
        all_history = voter_groups["Combo_history"].apply(list)
        all_history_indices = voter_groups["array_position"].apply(list)
        df_voters = df_voters.set_index("Registration_Number")
        # GA does not provide party registration; mark all as npa
        df_voters["party_identifier"] = "npa"
        df_voters["all_history"] = all_history
        df_voters["sparse_history"] = all_history_indices
        del history, voter_groups, all_history, all_history_indices
        gc.collect()

        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "Land_district",
                "Mail_house_nbr",
                "Land_lot",
                "Commission_district",
                "School_district",
                "Ward city council_code",
                "County_precinct_id",
                "Judicial_district",
                "County_district_a_value",
                "County_district_b_value",
                "City_precinct_id",
                "Mail_address_2",
                "Mail_address_3",
                "Mail_apt_unit_nbr",
                "Mail_country",
                "Residence_apt_unit_nbr",
            ],
        )

        # default=str because sorted_codes_dict contains datetime values
        self.meta = {
            "message":
            "georgia_{}".format(datetime.now().isoformat()),
            "array_encoding":
            json.dumps(sorted_codes_dict,
                       indent=4,
                       sort_keys=True,
                       default=str),
            "array_decoding":
            json.dumps(sorted_codes),
            "election_type":
            json.dumps(type_dict),
        }

        csv_obj = df_voters.to_csv(encoding="utf-8")
        del df_voters
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 8
0
    def execute(self):
        """Build the processed Connecticut voter file.

        Reads the comma-separated "EXT" voter extracts, reshapes the
        per-election (date, type, votetype) column triples into
        list-valued ``all_history`` / ``sparse_history`` /
        ``votetype_history`` columns indexed by voter id, records
        election metadata in ``self.meta``, and stages the result as a
        processed CSV in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")
        # Voter extracts are the archive members whose names contain "EXT".
        voter_files = [n for n in new_files if "EXT" in n["name"]]

        election_columns = self.config["election_columns"]
        electiontype_columns = self.config["electiontype_columns"]
        votetype_columns = self.config["votetype_columns"]
        election_date_columns = self.config["election_date_columns"]

        # Invert the config mappings so raw file codes can be translated
        # back to their canonical names.
        electiontype_codes = {
            v: k
            for k, v in self.config["election_type_code"].items()
        }
        votetype_codes = {
            v: k
            for k, v in self.config["absentee_ballot_code"].items()
        }

        # Read every extract with string dtype and stack them into one frame.
        df_voter = pd.concat(
            [
                pd.read_csv(
                    f["obj"],
                    names=self.config["column_names"],
                    index_col=False,
                    sep=",",
                    dtype=str,
                    skipinitialspace=True,
                ) for f in voter_files
            ],
            ignore_index=True,
        )

        # --- handling the vote history file --- #

        df_hist = df_voter.set_index(
            self.config["voter_id"]).loc[:, election_columns]

        election_df = []
        # Each tuple c is one election's (date, type, votetype) column triple.
        election_zip = list(
            zip(election_date_columns, electiontype_columns, votetype_columns))

        for c in election_zip:
            # Keep only voters who have a date recorded for this election.
            election = df_hist.loc[~df_hist[c[0]].isna(), c]
            election.columns = ["electiondate", "electiontype", "votetype"]
            electiondate = pd.to_datetime(election.loc[:, "electiondate"])

            election.loc[:, "electiondate"] = electiondate
            # Unknown codes fall back to "NP"; everything is lower-cased.
            election.loc[:, "electiontype"] = (
                election.loc[:, "electiontype"].str.strip().map(
                    electiontype_codes).fillna("NP").str.lower())
            election.loc[:, "votetype_history"] = (
                election.loc[:, "votetype"].str.strip().map(
                    votetype_codes).fillna("NP").str.lower())
            # all_history entries look like "YYYY_MM_DD_<electiontype>".
            election.loc[:, "all_history"] = (
                electiondate.dt.strftime("%Y_%m_%d_") +
                election.loc[:, "electiontype"])

            election = election.loc[:, [
                "electiondate", "votetype_history", "all_history"
            ]]

            election_df.append(election)

        df_hist = pd.concat(election_df).reset_index()

        # Count participants per (all_history, electiondate) pair; the
        # enumeration order defines each election's sparse index.
        elections = (df_hist.groupby([
            "all_history", "electiondate"
        ])[self.config["voter_id"]].count().reset_index().values)
        sorted_elections_dict = {
            k[0]: {
                "index": i,
                "count": int(k[2]),
                "date": k[1].strftime("%Y-%m-%d"),
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        df_hist = df_hist.drop("electiondate", axis=1)

        # sparse_history holds the integer index of each election string.
        df_hist.loc[:, "sparse_history"] = df_hist.all_history.map(
            lambda x: sorted_elections_dict[x]["index"])

        # Collapse each history column into a per-voter list.
        group = df_hist.groupby(self.config["voter_id"])
        election_df = []

        for c in df_hist.columns[1:]:
            election_df.append(group[c].apply(list))

        df_hist = pd.concat(election_df, axis=1)

        # --- handling the voter file --- #

        # Drop the raw per-election columns now that they are consolidated.
        df_voter = df_voter.loc[:, ~df_voter.columns.isin(election_columns)]
        df_voter = df_voter.set_index(self.config["voter_id"])

        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_numeric(df_voter)
        df_voter = self.config.coerce_dates(df_voter)
        df_voter = df_voter.join(df_hist)

        self.meta = {
            "message": "connecticut_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
    def execute(self):
        """Build the processed Pennsylvania voter file.

        Pennsylvania ships one tab-separated voter file ("FVE") per
        county, plus per-county election-map, zone-code, and zone-type
        files.  For each county this collapses the positional
        vote-history and district columns into list-valued
        ``all_history`` and ``districts`` columns, accumulates election
        participation counts for the metadata, concatenates the county
        frames, and stages one processed CSV for S3 upload.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        config = Config(file_name=self.config_file)
        new_files = self.unpack_files(file_obj=self.main_file)
        # These files are large; free the raw archive as early as possible.
        del self.main_file, self.temp_files
        gc.collect()

        # Partition the archive members by their role.
        voter_files = [f for f in new_files if "FVE" in f["name"]]
        election_maps = [f for f in new_files if "Election Map" in f["name"]]
        zone_codes = [f for f in new_files if "Codes" in f["name"]]
        zone_types = [f for f in new_files if "Types" in f["name"]]
        del new_files
        gc.collect()

        if not self.ignore_checks:
            # election maps need to line up to voter files?
            self.file_check(len(voter_files), len(election_maps))
        counties = config["county_names"]
        main_df = None
        # Preserving the order of the file sent, but concatenating the district and election columns which were
        # dropped in the legacy processed file
        dfcols = (config["ordered_columns"][:-3] + config["district_columns"] +
                  config["election_columns"] + config["ordered_columns"][-3:])

        # create a mapping that returns a series based on the values across rows (voters) of cells (election info).
        # consolidates the non nan values into one string that can be appended as a column later for the all_history and
        # the districts columns
        def list_map(df_sub, columns, zone_dict=None):
            # mapping() turns one row's cells into a list, dropping "nan"
            # placeholders; with a zone_dict it also translates zone codes
            # into their combined descriptions, skipping unknown codes.
            def mapping(li, zone_dict=zone_dict):
                if zone_dict is None:
                    li = [x for x in li if x != "nan"]
                    return li
                else:
                    li = [
                        zone_dict[x] for x in li
                        if x != "nan" and x in zone_dict
                    ]
                    return li

            return pd.Series(
                map(mapping, df_sub[columns].values.astype(str).tolist()))

        sorted_codes = []
        sorted_code_dict = defaultdict(defaultdict)
        dtypes = {col: "str" for col in dfcols}
        for idx, c in enumerate(counties):
            logging.info("Processing {} {}/{}".format(c, idx, len(counties)))
            c = format_column_name(c)
            # A county missing any of its four files is skipped entirely.
            try:
                voter_file = next(f for f in voter_files
                                  if c in f["name"].lower())
                election_map = next(f for f in election_maps
                                    if c in f["name"].lower())
                zones = next(f for f in zone_codes if c in f["name"].lower())
                types = next(f for f in zone_types if c in f["name"].lower())
            except StopIteration:
                continue
            df = self.read_csv_count_error_lines(
                voter_file["obj"],
                sep="\t",
                names=dfcols,
                error_bad_lines=False,
                dtype=dtypes,
            )
            edf = self.read_csv_count_error_lines(
                election_map["obj"],
                sep="\t",
                names=["county", "number", "title", "date"],
                error_bad_lines=False,
                dtype={
                    "county": str,
                    "number": str,
                    "title": str,
                    "date": str,
                },
            )
            zdf = self.read_csv_count_error_lines(
                zones["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_code",
                    "zone_description",
                ],
                error_bad_lines=False,
            )
            tdf = self.read_csv_count_error_lines(
                types["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_short_name",
                    "zone_long_name",
                ],
                error_bad_lines=False,
            )

            # Refactor note: format the election data into the format expected in the original all_history column
            edf["election_list"] = edf["title"] + " " + edf["date"]

            # Gather the positional vote and district columns
            # NOTE(review): positions 30:70 / 70:150 are assumed fixed by the
            # state's file layout — confirm against the config if it changes.
            district_columns = df.columns[30:70].to_list()
            vote_columns = df.columns[70:150].to_list()

            # create a dict of the formatted election data using the index number in the given file, this
            # corresponds to the column index beginning at the start of the vote columns in the dataframe
            # Index begins starting at 1
            election_map = pd.Series(edf.election_list.values,
                                     index=edf.number).to_dict()

            # merge the zone files together to consolidate the information in one dataframe
            zdf = zdf.merge(tdf, how="left", on="zone_number")
            # format a column field that contains the zone description and the name so
            # that it matches the current district field
            zdf["combined"] = (zdf["zone_description"] + " Type: " +
                               zdf["zone_long_name"])

            # create a dict that utilizes the zone code as the key and the long name string as the value
            zone_dict = dict(zip(zdf.zone_code.astype(str), zdf.combined))

            # Gather the pairs of election columns to iterate over both at the same time to collect the information
            # contained in both of the columns per election
            vote_column_list = list(
                zip(df.columns[70:150:2], df.columns[71:150:2]))

            # get the value from the election map key for the election name,
            # then combine it with the value in the party and vote type cells for the full election information
            # Creates a history dataframe containing, as cells, the election name as gathered in the election file, the
            # vote type (AP, A etc), and the party information, all separated by spaces
            # The columns are all named election_#_vote_type but the cells contain the relevant information
            vote_hist_df = pd.DataFrame({
                i: election_map[i.split("_")[1]] + " " + df[i] + " " + df[j]
                for i, j in vote_column_list if i.split("_")[1] in election_map
            })

            # counts for the metadata
            counts = vote_hist_df.count()
            for i in counts.index:
                current_key = election_map[i.split("_")[1]]

                # Metadata needs to be _ separated not space
                current_key = "_".join(current_key.split())
                if current_key in sorted_code_dict:
                    # Election already seen in another county: add its count.
                    sorted_code_dict[current_key]["count"] += int(counts[i])
                else:
                    current_date = edf.loc[edf["number"] == i.split("_")
                                           [1]]["date"].values[0]
                    new_dict_entry = defaultdict(str)
                    new_dict_entry["date"] = current_date
                    new_dict_entry["count"] = int(counts[i])
                    sorted_code_dict[current_key] = new_dict_entry
            # converts the dataframe to a series that contains the list of elections participated in indexed on position
            vote_hist_df = list_map(vote_hist_df, vote_hist_df.columns)
            districts = list_map(df[district_columns], district_columns,
                                 zone_dict)

            df["all_history"] = vote_hist_df
            df["districts"] = districts
            df.drop(vote_columns, axis=1, inplace=True)
            df.drop(district_columns, axis=1, inplace=True)

            cols_to_check = [
                col for col in list(df.columns)
                if col not in vote_columns and col not in district_columns
            ]

            self.column_check(list(df.columns), cols_to_check)
            if main_df is None:
                main_df = df
            else:
                main_df = pd.concat([main_df, df], ignore_index=True)

        del voter_files, election_maps, zone_codes, zone_types
        gc.collect()

        # Assign each election a stable index, ordered by parsed date.
        sorted_keys = sorted(sorted_code_dict.items(),
                             key=lambda x: parser.parse(x[1]["date"]))
        for index, key in enumerate(sorted_keys):
            sorted_code_dict[key[0]]["index"] = index
            sorted_codes.append(key[0])
        del sorted_keys
        gc.collect()

        logging.info("coercing")
        main_df = config.coerce_dates(main_df)
        main_df = config.coerce_numeric(
            main_df,
            extra_cols=[
                "house_number",
                "apartment_number",
                "address_line_2",
                "zip",
                "mail_address_1",
                "mail_address_2",
                "mail_zip",
                "precinct_code",
                "precinct_split_id",
                "legacy_id",
                "home_phone",
            ],
        )
        logging.info("Writing CSV")
        self.meta = {
            "message": "pennsylvania_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_code_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = main_df.to_csv(encoding="utf-8", index=False)
        del main_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 10
0
    def execute(self):
        """Build the processed Kansas voter file.

        Reads the tab-delimited voter extract, fits it to either the
        legacy or the new column layout, consolidates the per-election
        history columns into an index-encoded ``all_history`` column,
        and stages the result as a processed CSV for S3 upload.

        Raises:
            MissingNumColumnsError: if the file matches neither the
                legacy nor the new column layout.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="unzip")

        if not self.ignore_checks:
            self.file_check(len(new_files))

        for f in new_files:
            # Skip macOS resource-fork entries ("._") and the layout
            # description file; only the tab-delimited .txt extract is read.
            if ((".txt" in f["name"]) and ("._" not in f["name"])
                    and ("description" not in f["name"].lower())):
                logging.info("reading kansas file from {}".format(f["name"]))
                df = self.read_csv_count_error_lines(
                    f["obj"],
                    sep="\t",
                    index_col=False,
                    engine="c",
                    error_bad_lines=False,
                    encoding="latin-1",
                )
        # Try the legacy layout first, then the new layout; if neither
        # matches the column count, the file is malformed.
        try:
            df.columns = self.config["ordered_columns"]
        except ValueError:
            try:
                df.columns = self.config["ordered_columns_new"]
            except ValueError:
                logging.info("Incorrect number of columns found for Kansas")
                raise MissingNumColumnsError(
                    "{} state is missing columns".format(self.state),
                    self.state,
                    len(self.config["ordered_columns_new"]),
                    len(df.columns),
                )

            # New layout matched: backfill columns that exist only in the
            # legacy layout so downstream consumers see a stable schema.
            for i in set(list(self.config["ordered_columns"])) - set(
                    list(self.config["ordered_columns_new"])):
                df[i] = None
        df[self.config["voter_status"]] = df[
            self.config["voter_status"]].str.replace(" ", "")

        def ks_hist_date(s):
            """Extract the election year from a history code, or None.

            The year is assumed to sit at characters 2:6 of the code —
            TODO confirm against the raw file layout.  Unparseable or
            out-of-range years become None.
            """
            try:
                elect_year = parser.parse(s[2:6]).year
            # Fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception.
            except Exception:
                elect_year = -1
            if (elect_year < 1850) or (elect_year > date.today().year + 1):
                elect_year = None
            return elect_year

        def add_history(main_df):
            """Encode history columns into all_history; return codes/dict."""
            count_df = pd.DataFrame()
            # Count occurrences of each history code across all columns.
            for idx, hist in enumerate(self.config["hist_columns"]):
                unique_codes, counts = np.unique(
                    main_df[hist].str.replace(" ", "_").dropna().values,
                    return_counts=True,
                )
                count_df_new = pd.DataFrame(index=unique_codes,
                                            data=counts,
                                            columns=["counts_" + hist])
                count_df = pd.concat([count_df, count_df_new], axis=1)
            count_df["total_counts"] = count_df.sum(axis=1)
            unique_codes = count_df.index.values
            counts = count_df["total_counts"].values
            # Order codes by ascending total count; that order defines
            # each code's stable integer index.
            count_order = counts.argsort()
            unique_codes = unique_codes[count_order]
            counts = counts[count_order]
            sorted_codes = unique_codes.tolist()
            sorted_codes_dict = {
                k: {
                    "index": i,
                    "count": int(counts[i]),
                    "date": ks_hist_date(k),
                }
                for i, k in enumerate(sorted_codes)
            }

            def insert_code_bin(arr):
                # Replace each history code with its integer index.
                return [sorted_codes_dict[k]["index"] for k in arr]

            main_df["all_history"] = main_df[
                self.config["hist_columns"]].apply(
                    lambda x: list(x.dropna().str.replace(" ", "_")), axis=1)
            main_df.all_history = main_df.all_history.map(insert_code_bin)
            return sorted_codes, sorted_codes_dict

        sorted_codes, sorted_codes_dict = add_history(main_df=df)

        df = self.config.coerce_numeric(df)
        df = self.config.coerce_strings(df)
        df = self.config.coerce_dates(df)
        self.meta = {
            "message": "kansas_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 11
0
    def execute(self):
        """Build the processed Nevada voter file.

        Reads the eligible-voter ("ElgbVtr") and vote-history ("VtHst")
        files, attaches per-voter ``all_history`` / ``votetype_history``
        / ``sparse_history`` lists, builds a county-qualified voter id,
        normalizes district fields to bare digit strings, and stages a
        processed CSV for S3 upload.

        Raises:
            MissingNumColumnsError: if the voter file does not match the
                configured column layout.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")

        if not self.ignore_checks:
            self.file_check(len(new_files))
        # The archive holds exactly two files; pick each by name marker.
        voter_file = (
            new_files[0] if "ElgbVtr" in new_files[0]["name"] else new_files[1]
        )
        hist_file = (
            new_files[0] if "VtHst" in new_files[0]["name"] else new_files[1]
        )

        df_hist = self.read_csv_count_error_lines(
            hist_file["obj"], header=None, error_bad_lines=False
        )
        df_hist.columns = self.config["hist_columns"]
        df_voters = self.read_csv_count_error_lines(
            voter_file["obj"], header=None, error_bad_lines=False
        )
        # Free the raw archive objects as soon as possible.
        del self.main_file, self.temp_files, voter_file, hist_file, new_files
        gc.collect()

        try:
            df_voters.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for Nevada")
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(df_voters.columns),
            )

        # Elections sorted chronologically; position defines each
        # election's stable integer index.
        sorted_codes = df_hist.date.unique().tolist()
        sorted_codes.sort(key=lambda x: datetime.strptime(x, "%m/%d/%Y"))
        counts = df_hist.date.value_counts()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts.loc[k]),
                "date": date_from_str(k),
            }
            for i, k in enumerate(sorted_codes)
        }

        def insert_code_bin(arr):
            # Voters with no history have NaN instead of a list.
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        df_voters = df_voters.set_index("VoterID", drop=False)
        voter_id_groups = df_hist.groupby("VoterID")
        df_voters["all_history"] = voter_id_groups["date"].apply(list)
        df_voters["votetype_history"] = voter_id_groups["vote_code"].apply(
            list
        )
        del df_hist, voter_id_groups
        gc.collect()

        df_voters["sparse_history"] = df_voters["all_history"].map(
            insert_code_bin
        )

        # create compound string for unique voter ID from county ID
        df_voters["County_Voter_ID"] = (
            df_voters["County"].str.replace(" ", "").str.lower()
            + "_"
            + df_voters["County_Voter_ID"].astype(int).astype(str)
        )
        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "Zip",
                "Phone",
                "Congressional_District",
                "Senate_District",
                "Assembly_District",
                "Education_District",
                "Regent_District",
                "Registered_Precinct",
            ],
        )
        df_voters = self.config.coerce_strings(df_voters)

        # standardize district data - over time these have varied from:
        #   "1" vs. "district 1" vs "cd1"/"sd1"/"ad1"
        # Fix: raw string — "\d+" is an invalid escape sequence
        # (DeprecationWarning; SyntaxWarning from Python 3.12).
        digits = re.compile(r"\d+")

        def get_district_number_str(x):
            """Return the first digit run in x, or None for non-strings."""
            try:
                s = digits.search(x)
            except TypeError:
                return None
            if s is not None:
                return s.group()
            else:
                return None

        # Normalize e.g. 1.0 -> "1" first, then strip any prefix text
        # down to the bare district number.
        for district_col in (
            "Congressional_District",
            "Senate_District",
            "Assembly_District",
        ):
            df_voters[district_col] = (
                df_voters[district_col].map(ensure_int_string)
            )
            df_voters[district_col] = (
                df_voters[district_col].map(get_district_number_str)
            )

        self.meta = {
            "message": "nevada_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = df_voters.to_csv(encoding="utf-8", index=False)
        del df_voters
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 12
0
    def execute(self):
        """Build the processed Ohio voter file.

        Concatenates the gzip-compressed voter-file chunks, derives
        election metadata from the GENERAL-/SPECIAL-/PRIMARY- history
        columns, normalizes district and zip fields to plain integer
        strings, and stages the result as a processed CSV for S3 upload.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file)

        if not self.ignore_checks:
            self.file_check(len(new_files))

        # Fix: the original assigned `df` only in the "_22" branch and
        # concatenated onto it in the ".txt" branch, so a ".txt" chunk
        # arriving first raised NameError, and a late "_22" file silently
        # overwrote accumulated data.  Accumulate all chunks uniformly.
        df = None
        for i in new_files:
            logging.info("Loading file {}".format(i))
            if ("_22" in i["name"]) or (".txt" in i["name"]):
                temp_df = self.read_csv_count_error_lines(
                    i["obj"],
                    encoding="latin-1",
                    compression="gzip",
                    error_bad_lines=False,
                )
                df = temp_df if df is None else pd.concat(
                    [df, temp_df], axis=0
                )

        # create history meta data
        voting_history_cols = list(
            filter(
                lambda x: any(
                    [pre in x for pre in ("GENERAL-", "SPECIAL-", "PRIMARY-")]
                ),
                df.columns.values,
            )
        )
        self.column_check(list(set(df.columns) - set(voting_history_cols)))
        total_records = df.shape[0]
        sorted_codes = voting_history_cols
        # Per-election count = rows with a non-null entry in that column.
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(total_records - df[k].isna().sum()),
                "date": date_from_str(k),
            }
            for i, k in enumerate(voting_history_cols)
        }

        # ensure district and other numeric fields are e.g. "1" not "1.0"
        for col in (
            "CONGRESSIONAL_DISTRICT",
            "STATE_REPRESENTATIVE_DISTRICT",
            "STATE_SENATE_DISTRICT",
            "COURT_OF_APPEALS",
            "STATE_BOARD_OF_EDUCATION",
            "RESIDENTIAL_ZIP",
            "RESIDENTIAL_ZIP_PLUS4",
            "MAILING_ZIP",
            "MAILING_ZIP_PLUS4",
        ):
            df[col] = df[col].map(ensure_int_string)

        self.meta = {
            "message": "ohio_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 13
0
    def execute(self):
        """Build the processed Virginia voter file.

        Requires both a vote-history file and a registered-voter file in
        the archive.  Builds per-voter ``all_history`` /
        ``sparse_history`` / ``election_type_history`` /
        ``party_history`` / ``votetype_history`` lists from the history
        file and stages the joined result as a processed CSV.

        Raises:
            ValueError: if either the history or voter file is missing.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file, compression="unzip"
        )

        # throw exception if missing one of the two files needed for processing
        valid_files = []
        for file in new_files:
            valid_files.append(file["name"].lower())

        if not self.ignore_checks:
            self.file_check(len(new_files))
        # faster to just join them into a tab separated string
        valid_files = "\t".join(valid_files)
        if "history" not in valid_files or "registered" not in valid_files:
            raise ValueError("must supply both history and voter file")

        for f in new_files:
            if "history" in f["name"].lower():
                logging.info("vote history found")
                hist_df = self.read_csv_count_error_lines(
                    f["obj"], error_bad_lines=False, encoding="ISO-8859-1"
                )
            elif "registered" in f["name"].lower():
                logging.info("voter file found")
                voters_df = self.read_csv_count_error_lines(
                    f["obj"], error_bad_lines=False, encoding="ISO-8859-1"
                )
        # Virginia supplies no party affiliation; fill with NaN.
        voters_df[self.config["party_identifier"]] = np.nan
        self.column_check(list(voters_df.columns))
        voters_df = self.config.coerce_strings(voters_df)
        voters_df = self.config.coerce_numeric(
            voters_df,
            extra_cols=[
                "TOWNPREC_CODE_VALUE",
                "SUPERDIST_CODE_VALUE",
                "HOUSE_NUMBER",
                "MAILING_ZIP",
            ],
        )
        voters_df = self.config.coerce_dates(voters_df)

        # Election key: snake_cased election name + "_" + its date string.
        hist_df["combined_name"] = (
            hist_df["ELECTION_NAME"].str.replace(" ", "_").str.lower()
            + "_"
            + hist_df["ELECTION_DATE"]
        )

        # Gathers the votetype columns that are initially boolean and replaces them with the word version of their name
        # collect all the columns where the value is True, combine to one votetype history separated by underscores
        # parsing in features will pull out the appropriate string
        hist_df["votetype_history"] = np.where(
            hist_df["VOTE_IN_PERSON"], "inPerson_", ""
        )
        hist_df["votetype_history"] += np.where(
            hist_df["PROTECTED"], "protected_", ""
        )
        hist_df["votetype_history"] += np.where(
            hist_df["ABSENTEE"], "absentee_", ""
        )
        hist_df["votetype_history"] += np.where(
            hist_df["PROVISIONAL"], "provisional_", ""
        )
        # replace the empty strings with nan for cleaner db cell values
        hist_df["votetype_history"].replace("", np.nan, inplace=True)

        # Elections sorted chronologically by the trailing MM/DD/YYYY
        # component of the key; position defines each election's index.
        sorted_codes = hist_df["combined_name"].unique().tolist()
        sorted_codes.sort(
            key=lambda x: datetime.strptime(x.split("_")[-1], "%m/%d/%Y")
        )
        counts = hist_df["combined_name"].value_counts()

        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts.loc[k]),
                "date": k.split("_")[-1],
            }
            for i, k in enumerate(sorted_codes)
        }

        def insert_code_bin(arr):
            # Voters with no history have NaN instead of a list.
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        # Attach per-voter history lists by grouping on the shared id.
        voters_df = voters_df.set_index("IDENTIFICATION_NUMBER", drop=False)
        voter_id_groups = hist_df.groupby("IDENTIFICATION_NUMBER")
        voters_df["all_history"] = voter_id_groups["combined_name"].apply(list)
        voters_df["sparse_history"] = voters_df["all_history"].map(
            insert_code_bin
        )
        voters_df["election_type_history"] = voter_id_groups[
            "ELECTION_TYPE"
        ].apply(list)
        voters_df["party_history"] = voter_id_groups[
            "PRIMARY_TYPE_CODE_NAME"
        ].apply(list)
        voters_df["votetype_history"] = voter_id_groups[
            "votetype_history"
        ].apply(list)
        gc.collect()

        self.meta = {
            "message": "virginia_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voters_df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 14
0
    def execute(self):
        """Preprocess the Colorado voter file into one processed CSV.

        Downloads the raw zip from S3 (when configured), loads the
        registered-voter list — or reconstructs it from the "master"
        voter-details files when no Registered_Voters_List is present —
        attaches per-voter vote-history arrays, coerces column types, and
        stores the result in ``self.processed_file``.

        Raises:
            ValueError: if no voter-history file is found in the archive.
            MissingNumColumnsError: if a master voter-details file has an
                unexpected number of columns.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)
        # Accumulators for the three possible file formats in the archive.
        df_voter = pd.DataFrame(columns=self.config.raw_file_columns())
        df_hist = pd.DataFrame(columns=self.config["hist_columns"])
        df_master_voter = pd.DataFrame(
            columns=self.config["master_voter_columns"])
        # Assume the "master" file layout until a Registered_Voters_List
        # file proves otherwise (detected in the first loop below).
        master_vf_version = True

        def master_to_reg_df(df):
            # Convert a master-voter-format frame into the registered-voter
            # layout expected downstream: derive STATUS / PRECINCT /
            # VOTER_NAME, pad with the blacklist columns, then project onto
            # the processed-file column set.
            df.columns = self.config["master_voter_columns"]
            df["STATUS"] = df["VOTER_STATUS"]
            df["PRECINCT"] = df["PRECINCT_CODE"]
            df["VOTER_NAME"] = (df["LAST_NAME"] + ", " + df["FIRST_NAME"] +
                                " " + df["MIDDLE_NAME"])
            df = pd.concat(
                [df,
                 pd.DataFrame(columns=self.config["blacklist_columns"])])
            df = df[self.config.processed_file_columns()]
            return df

        for i in new_files:
            if "Registered_Voters_List" in i["name"]:
                master_vf_version = False

        for i in new_files:
            if "Public" not in i["name"]:

                if ("Registered_Voters_List" in i["name"]
                        and not master_vf_version):
                    logging.info("reading in {}".format(i["name"]))
                    # Colorado has a couple different encodings they send us, the format that is detected as ascii will
                    # error out if not read in as latin-1
                    # The format that is typically detected as utf-8-sig needs to have the index col explicitly set to
                    # false, or else pandas will attempt to read the voterid column
                    # in as the index and the history won't apply
                    encoding_result = chardet.detect(i["obj"].read(10000))
                    if encoding_result["encoding"] == "ascii":
                        encoding = "latin-1"
                        index_col = None
                    else:
                        encoding = encoding_result["encoding"]
                        index_col = False
                    # Rewind after the 10 KB sniff so the full file is read.
                    i["obj"].seek(0)
                    df_voter = pd.concat(
                        [
                            df_voter,
                            self.read_csv_count_error_lines(
                                i["obj"],
                                encoding=encoding,
                                error_bad_lines=False,
                                index_col=index_col,
                            ),
                        ],
                        axis=0,
                    )

                elif ("Voting_History"
                      in i["name"]) or ("Coordinated_Voter_Details"
                                        in i["name"]):
                    # History files (but not the master voter-details file)
                    # feed the election-history accumulator.
                    if "Voter_Details" not in i["name"]:
                        logging.info("reading in {}".format(i["name"]))
                        new_df = self.read_csv_count_error_lines(
                            i["obj"],
                            compression="gzip",
                            error_bad_lines=False)
                        df_hist = pd.concat([df_hist, new_df], axis=0)

                    if "Voter_Details" in i["name"] and master_vf_version:
                        logging.info("reading in {}".format(i["name"]))
                        new_df = self.read_csv_count_error_lines(
                            i["obj"],
                            compression="gzip",
                            error_bad_lines=False)
                        # Older master files lack PHONE_NUM; insert a blank
                        # column at position 10 so the header assignment fits.
                        if len(new_df.columns) < len(
                                self.config["master_voter_columns"]):
                            new_df.insert(10, "PHONE_NUM", np.nan)
                        try:
                            new_df.columns = self.config[
                                "master_voter_columns"]
                        except ValueError:
                            logging.info(
                                "Incorrect number of columns found for Colorado for file: {}"
                                .format(i["name"]))
                            raise MissingNumColumnsError(
                                "{} state is missing columns".format(
                                    self.state),
                                self.state,
                                len(self.config["master_voter_columns"]),
                                len(new_df.columns),
                            )
                        df_master_voter = pd.concat([df_master_voter, new_df],
                                                    axis=0)

        # Fall back to the master-format data when no registered-voter
        # list was present in the archive.
        if df_voter.empty:
            df_voter = master_to_reg_df(df_master_voter)
        if df_hist.empty:
            raise ValueError("must supply a file containing voter history")
        df_hist["VOTING_METHOD"] = df_hist["VOTING_METHOD"].replace(np.nan, "")
        # Unparseable election dates become NaT and are dropped below.
        df_hist["ELECTION_DATE"] = pd.to_datetime(df_hist["ELECTION_DATE"],
                                                  format="%m/%d/%Y",
                                                  errors="coerce")
        df_hist.dropna(subset=["ELECTION_DATE"], inplace=True)
        # Election key: "YYYY-MM-DD_<method>".
        df_hist["election_name"] = (df_hist["ELECTION_DATE"].astype(str) +
                                    "_" + df_hist["VOTING_METHOD"])

        valid_elections, counts = np.unique(df_hist["election_name"],
                                            return_counts=True)

        # Order elections newest-first by the leading ISO date of the key.
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][0:10], "%Y-%m-%d"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        # election key -> {index into sorted_codes, vote count, date string}
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        df_hist["array_position"] = df_hist["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        logging.info("Colorado: history apply")
        voter_groups = df_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["VOTING_METHOD"].apply(list)

        # Index the voter frame by voter id so the per-voter history
        # Series below align on assignment.
        df_voter.dropna(subset=[self.config["voter_id"]], inplace=True)
        df_voter = df_voter.set_index(self.config["voter_id"])
        df_voter.sort_index(inplace=True)

        df_voter["all_history"] = all_history
        df_voter["vote_type"] = vote_type
        gc.collect()

        # at some point mailing address field names changed
        for num in ["1", "2", "3"]:
            if f"MAIL_ADDR{num}" in df_voter.columns:
                # if both are present, combine them
                if f"MAILING_ADDRESS_{num}" in df_voter.columns:
                    df_voter[f"MAILING_ADDRESS_{num}"] = np.where(
                        df_voter[f"MAILING_ADDRESS_{num}"].isnull(),
                        df_voter[f"MAIL_ADDR{num}"],
                        df_voter[f"MAILING_ADDRESS_{num}"],
                    )
                else:
                    df_voter[f"MAILING_ADDRESS_{num}"] = df_voter[
                        f"MAIL_ADDR{num}"]
                df_voter.drop(columns=[f"MAIL_ADDR{num}"], inplace=True)

        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_dates(df_voter)
        df_voter = self.config.coerce_numeric(
            df_voter,
            extra_cols=[
                "HOUSE_NUM",
                "UNIT_NUM",
                "RESIDENTIAL_ZIP_CODE",
                "RESIDENTIAL_ZIP_PLUS",
                "MAILING_ZIP_CODE",
                "MAILING_ZIP_PLUS",
                "PRECINCT_NAME",
                "PRECINCT",
                "MAILING_ADDRESS_3",
                "PHONE_NUM",
            ],
        )

        self.meta = {
            "message": "Colorado_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        gc.collect()
        logging.info("Colorado: writing out")
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 15
0
    def execute(self):
        """Preprocess the Arizona voter file.

        Concatenates the unpacked file segments, collapses the
        per-election vote/method columns into postgres-style array
        strings, parses the birthday column, coerces numeric columns,
        and stores the processed CSV in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        segments = self.unpack_files(file_obj=self.main_file,
                                     compression="unzip")
        # The legend workbook is documentation, not data.
        segments = [s for s in segments if "LEGEND.xlsx" not in s["name"]]

        merged = self.concat_file_segments(segments)

        df = self.read_csv_count_error_lines(merged, error_bad_lines=False)

        # Per-election history columns come in two flavors; capture both
        # lists before any new columns are added.
        action_cols = [c for c in df.columns.values if "party_voted" in c]
        method_cols = [c for c in df.columns.values if "voting_method" in c]

        df["all_history"] = df_to_postgres_array_string(df, action_cols)
        df["all_voting_methods"] = df_to_postgres_array_string(
            df, method_cols)

        # Birthdays arrive numeric-ish; missing values become -1, which
        # fails to parse and is coerced to NaT.
        bday_col = self.config["birthday_identifier"]
        df[bday_col] = pd.to_datetime(
            df[bday_col].fillna(-1).astype(int).astype(str),
            format=self.config["date_format"],
            errors="coerce",
        )
        # Election labels are the trailing token of each action column name.
        elections_key = [c.split("_")[-1] for c in action_cols]

        # The raw per-election columns are no longer needed.
        df.drop(action_cols + method_cols, axis=1, inplace=True)

        df.columns = df.columns.str.strip(" ")
        df = self.config.coerce_numeric(
            df,
            extra_cols=[
                "text_mail_zip5",
                "text_mail_zip4",
                "text_phone_last_four",
                "text_phone_exchange",
                "text_phone_area_code",
                "precinct_part_text_name",
                "precinct_part",
                "occupation",
                "text_mail_carrier_rte",
                "text_res_address_nbr",
                "text_res_address_nbr_suffix",
                "text_res_unit_nbr",
                "text_res_carrier_rte",
                "text_mail_address1",
                "text_mail_address2",
                "text_mail_address3",
                "text_mail_address4",
            ],
        )
        self.meta = {
            "message": "arizona_{}".format(datetime.now().isoformat()),
            "array_dates": json.dumps(elections_key),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 16
0
    def execute(self):
        """Preprocess the New Jersey voter and history files.

        Reads the pipe-delimited AlphaVoter (voter) and History files,
        builds per-voter history arrays (``all_history``,
        ``party_history``, ``sparse_history``), normalizes inactive
        statuses and out-of-range birthdates, and stores the combined
        frame in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file)
        # BUGFIX: this previously read ``config = config_file``, a name not
        # defined in this method. Use the state config object already
        # attached to this preprocessor — the same object the rest of this
        # method uses via ``self.config`` — so the column lookups below
        # cannot hit a NameError.
        config = self.config
        voter_files = [n for n in new_files if "AlphaVoter" in n["name"]]

        hist_files = [n for n in new_files if "History" in n["name"]]
        vdf = pd.DataFrame()
        hdf = pd.DataFrame()
        # Voter files have no header row; column names come from config.
        for f in voter_files:
            logging.info("Reading " + f["name"])
            new_df = self.read_csv_count_error_lines(
                f["obj"],
                sep="|",
                names=config["ordered_columns"],
                low_memory=False,
                error_bad_lines=False,
            )
            new_df = self.config.coerce_dates(new_df)
            new_df = self.config.coerce_numeric(
                new_df, extra_cols=["regional_school", "fire", "apt_no"])
            vdf = pd.concat([vdf, new_df], axis=0)
        for f in hist_files:
            logging.info("Reading " + f["name"])
            new_df = self.read_csv_count_error_lines(
                f["obj"],
                sep="|",
                names=config["hist_columns"],
                index_col=False,
                low_memory=False,
                error_bad_lines=False,
            )
            new_df = self.config.coerce_numeric(new_df,
                                                col_list="hist_columns_type")
            hdf = pd.concat([hdf, new_df], axis=0)

        # Disambiguate elections that share a name across years by
        # appending the election date.
        hdf["election_name"] = (hdf["election_name"] + " " +
                                hdf["election_date"])
        hdf = self.config.coerce_dates(hdf, col_list="hist_columns_type")
        hdf.sort_values("election_date", inplace=True)
        hdf = hdf.dropna(subset=["election_name"])
        hdf = hdf.reset_index()
        # Elections in chronological order (hdf was sorted by date above);
        # the dict maps each election to its array index and vote count.
        elections = hdf["election_name"].unique().tolist()
        counts = hdf["election_name"].value_counts()
        elec_dict = {
            k: {
                "index": i,
                "count": int(counts.loc[k]) if k in counts else 0
            }
            for i, k in enumerate(elections)
        }
        # Collapse the two "Inactive Confirmation" variants to "Inactive"
        # while preserving the original value for reference.
        vdf["unabridged_status"] = vdf["status"]
        vdf.loc[(vdf["status"] == "Inactive Confirmation")
                | (vdf["status"] == "Inactive Confirmation-Need ID"),
                "status", ] = "Inactive"
        # Index by voter id so the grouped history Series align on
        # assignment below.
        vdf["tmp_id"] = vdf["voter_id"]
        vdf = vdf.set_index("tmp_id")

        hdf_id_group = hdf.groupby("voter_id")
        logging.info("Creating all_history array")
        vdf["all_history"] = hdf_id_group["election_name"].apply(list)
        logging.info("Creating party_history array")
        vdf["party_history"] = hdf_id_group["party_code"].apply(list)

        def insert_code_bin(arr):
            # Voters with no history rows get NaN from the join above;
            # encode them as an empty history.
            if arr is np.nan:
                return []
            else:
                return [elec_dict[k]["index"] for k in arr]

        vdf["sparse_history"] = vdf["all_history"].apply(insert_code_bin)
        # Birthdates before 1900 are presumed data-entry errors; null them.
        vdf.loc[vdf[self.config["birthday_identifier"]] < pd.
                to_datetime("1900-01-01"),
                self.config["birthday_identifier"], ] = pd.NaT

        self.meta = {
            "message": "new_jersey_{}".format(datetime.now().isoformat()),
            "array_encoding": elec_dict,
            "array_decoding": elections,
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(vdf.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 17
0
    def execute(self):
        """Preprocess the Minnesota voter and election files.

        Loads the voter-registration and election-history CSVs from the
        unpacked archive, builds per-voter history arrays keyed by
        "<ElectionDate>_<VotingMethod>", coerces column types, and stores
        the result in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        logging.info("Minnesota: loading voter file")
        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)

        if not self.ignore_checks:
            self.file_check(len(new_files))
        voter_reg_df = pd.DataFrame(columns=self.config["ordered_columns"])
        voter_hist_df = pd.DataFrame(columns=self.config["hist_columns"])
        # Route each unpacked file by name: election-history vs. voter
        # registration (the latter arrives latin-1 encoded).
        for i in new_files:
            if "election" in i["name"].lower():
                voter_hist_df = pd.concat(
                    [
                        voter_hist_df,
                        self.read_csv_count_error_lines(i["obj"],
                                                        error_bad_lines=False),
                    ],
                    axis=0,
                )
            elif "voter" in i["name"].lower():
                voter_reg_df = pd.concat(
                    [
                        voter_reg_df,
                        self.read_csv_count_error_lines(i["obj"],
                                                        encoding="latin-1",
                                                        error_bad_lines=False),
                    ],
                    axis=0,
                )
        # Minnesota supplies neither status nor party; add placeholders.
        voter_reg_df[self.config["voter_status"]] = np.nan
        voter_reg_df[self.config["party_identifier"]] = np.nan

        # if the dataframes are assigned columns to begin with, there will be nans due to concat if the columns are off
        self.column_check(list(voter_reg_df.columns))

        # Keep only the 4-digit year portion of the DOB field.
        voter_reg_df["DOBYear"] = voter_reg_df["DOBYear"].astype(str).str[0:4]

        # Election key: "<ElectionDate>_<VotingMethod>".
        voter_hist_df["election_name"] = (voter_hist_df["ElectionDate"] + "_" +
                                          voter_hist_df["VotingMethod"])
        valid_elections, counts = np.unique(voter_hist_df["election_name"],
                                            return_counts=True)
        # Order elections newest-first. x[1][:-2] strips the trailing
        # "_<method>" suffix to leave the mm/dd/YYYY date — assumes
        # VotingMethod is a single character; TODO confirm against the data.
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][:-2], "%m/%d/%Y"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        # election key -> {index into sorted_codes, vote count, date string}
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        voter_hist_df["array_position"] = voter_hist_df["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        logging.info("Minnesota: history apply")
        voter_groups = voter_hist_df.groupby("VoterId")
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["VotingMethod"].apply(list)

        # Index by voter id so the grouped Series align on assignment.
        voter_reg_df = voter_reg_df.set_index(self.config["voter_id"])

        voter_reg_df["all_history"] = all_history
        voter_reg_df["vote_type"] = vote_type
        gc.collect()

        voter_reg_df = self.config.coerce_strings(voter_reg_df)
        voter_reg_df = self.config.coerce_dates(voter_reg_df)
        voter_reg_df = self.config.coerce_numeric(voter_reg_df)

        self.meta = {
            "message": "minnesota_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        gc.collect()
        logging.info("Minnesota: writing out")

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_reg_df.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 18
0
    def execute(self):
        """Preprocess the Arizona (format 2) voter files.

        Merges the per-status CSV/Excel files into one frame, normalizes
        headers and party values, splits the date-named history columns
        into per-voter history arrays (all/sparse/votetype/party), and
        stores the processed CSV in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def file_is_active(filename):
            # "Active" voter files are any not labeled with these statuses.
            for word in ["Canceled", "Suspense", "Inactive"]:
                if word in filename:
                    return False
            return True

        def add_files_to_main_df(main_df, file_list):
            # Read each CSV/Excel file, normalize headers via the alias
            # map, and append to the accumulator frame.
            alias_dict = self.config["column_aliases"]
            for f in file_list:
                if f["name"].split(".")[-1] == "csv":
                    new_df = self.read_csv_count_error_lines(
                        f["obj"], error_bad_lines=False)
                else:
                    new_df = pd.read_excel(f["obj"])

                for c in new_df.columns:
                    # files vary in consistent use of spaces in headers,
                    # and some headers have different aliases for headers
                    if c.replace(" ", "") in alias_dict.keys():
                        new_df.rename(
                            columns={c: alias_dict[c.replace(" ", "")]},
                            inplace=True,
                        )
                    else:
                        new_df.rename(columns={c: c.replace(" ", "")},
                                      inplace=True)
                new_df.rename(columns={"YearofBirth": "DOB"}, inplace=True)
                main_df = pd.concat([main_df, new_df], sort=False)
            return main_df

        def insert_code_bin(arr):
            # Map a list of election names to their sorted_codes indices.
            return [sorted_codes_dict[k]["index"] for k in arr]

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="unzip")

        active_files = [f for f in new_files if file_is_active(f["name"])]
        other_files = [f for f in new_files if not file_is_active(f["name"])]

        # Active voters first, then canceled/suspense/inactive.
        main_df = pd.DataFrame()
        main_df = add_files_to_main_df(main_df, active_files)
        main_df = add_files_to_main_df(main_df, other_files)
        main_df.reset_index(drop=True, inplace=True)

        main_df = self.config.coerce_dates(main_df)
        main_df = self.config.coerce_strings(main_df)
        main_df = self.config.coerce_numeric(
            main_df,
            extra_cols=[
                "HouseNumber",
                "UnitNumber",
                "ResidenceZip",
                "MailingZip",
                "Phone",
                "PrecinctPart",
                "VRAZVoterID",
            ],
        )
        # History columns are the only ones whose names start with a digit
        # (they are named after election dates).
        voter_columns = [c for c in main_df.columns if not c[0].isdigit()]
        history_columns = [c for c in main_df.columns if c[0].isdigit()]

        self.column_check(voter_columns)
        # Normalize case/whitespace/encoding for history, party, and
        # status values without disturbing missing entries.
        to_normalize = history_columns + [
            self.config["party_identifier"],
            self.config["voter_status"],
        ]
        for c in to_normalize:
            s = main_df[c].astype(str).str.strip().str.lower()
            s = s.str.encode("utf-8", errors="ignore").str.decode("utf-8")
            main_df.loc[(~main_df[c].isna()), c] = s.loc[(~main_df[c].isna())]
        for c in history_columns:
            main_df[c] = main_df[c].str.replace(" - ", "_")

        # Map party labels through the configured alias table.
        main_df[self.config["party_identifier"]] = main_df[
            self.config["party_identifier"]].map(
                lambda x: self.config["party_aliases"][x]
                if x in self.config["party_aliases"] else x)

        # handle history:
        # Reverse so elections run oldest-to-newest (presumably the raw
        # columns are newest-first — TODO confirm against the raw file).
        sorted_codes = history_columns[::-1]
        hist_df = main_df[sorted_codes]
        voter_df = main_df[voter_columns]
        counts = (~hist_df.isna()).sum()
        # election column name -> {index, vote count, date string}
        sorted_codes_dict = {
            k: {
                "index": int(i),
                "count": int(counts[i]),
                "date": date_from_str(k),
            }
            for i, k in enumerate(sorted_codes)
        }

        # Each history cell is "<votetype>_<party>" (party optional);
        # collect the non-null cells per voter into list columns.
        hist_df.loc[:, "vote_codes"] = pd.Series(hist_df.values.tolist())
        hist_df.loc[:, "vote_codes"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [c for c in x if not pd.isna(c)])
        voter_df.loc[:, "votetype_history"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [c.split("_")[0] for c in x])
        voter_df.loc[:, "party_history"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [
                c.split("_")[1] if len(c.split("_")) > 1 else self.config[
                    "no_party_affiliation"] for c in x
            ])

        # Replace each non-null history cell with its column (election)
        # name, then collapse per row into all_history / sparse_history.
        hist_df.drop(columns=["vote_codes"], inplace=True)
        for c in hist_df.columns:
            hist_df.loc[:, c] = hist_df.loc[:, c].map(
                lambda x: c if not pd.isna(x) else np.nan)
        voter_df.loc[:, "all_history"] = pd.Series(hist_df.values.tolist())
        voter_df.loc[:, "all_history"] = voter_df.loc[:, "all_history"].map(
            lambda x: [c for c in x if not pd.isna(c)])
        voter_df.loc[:, "sparse_history"] = voter_df.loc[:, "all_history"].map(
            insert_code_bin)

        expected_cols = (self.config["ordered_columns"] +
                         self.config["ordered_generated_columns"])
        voter_df = self.reconcile_columns(voter_df, expected_cols)
        voter_df = voter_df[expected_cols]

        self.meta = {
            "message": "arizona2_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 19
0
    def execute(self):
        """Preprocess the Wisconsin voter file.

        Detects the file's encoding and separator (tab vs. comma),
        identifies the date-named election columns, converts them into
        per-voter history arrays, coerces column types, and stores the
        processed CSV in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="unzip")

        if not self.ignore_checks:
            self.file_check(len(new_files))

        # Prefer a .txt member of the archive; otherwise take the first.
        preferred_files = [x for x in new_files if (".txt" in x["name"])]
        if len(preferred_files) > 0:
            main_file = preferred_files[0]
        else:
            main_file = new_files[0]

        # Sniff the encoding from the first 100 KB, then rewind.
        chardata = main_file["obj"].read(100000)
        result = chardet.detect(chardata)
        encoding_result = result["encoding"]
        main_file["obj"].seek(0)
        logging.info("encoding: {}".format(encoding_result))

        if encoding_result == "ascii":
            # the ascii format causes an pandas decoding error, latin-1 is the correct encoding
            encoding_result = "latin-1"
        else:
            # utf-8 is the pandas default and is the encoding for some files, usually the comma-separated files but not
            # always
            encoding_result = "utf-8"
        file_sep = "\t"
        # Some files are tab separated and some are comma separated,
        # if encoding_result != 'latin-1':
        # Read only the header row (nrows=0) to probe separator/encoding.
        try:
            wi_columns = pd.read_csv(
                main_file["obj"],
                sep=file_sep,
                nrows=0,
                encoding=encoding_result,
            ).columns.tolist()
        except UnicodeDecodeError:
            # Attempt Latin-1
            main_file["obj"].seek(0)
            wi_columns = pd.read_csv(main_file["obj"],
                                     sep=file_sep,
                                     nrows=0,
                                     encoding="latin-1").columns.tolist()
            encoding_result = "latin-1"

        # If the number of columns is only 1, the wrong separator was used and it is (probably) a csv file, read in the
        # correct columns with the encoding that works
        if len(wi_columns) == 1:
            main_file["obj"].seek(0)
            file_sep = ","
            wi_columns = pd.read_csv(
                main_file["obj"],
                sep=file_sep,
                nrows=0,
                encoding=encoding_result,
            ).columns.tolist()

        main_file["obj"].seek(0)
        cat_columns = self.config["categorical_columns"]

        # Helper function to determine if a column is a history column by checking it for a date
        def parse_histcols(col_name):
            try:
                parser.parse(col_name)
                return True
            except ValueError:
                return False

        # iterate through the dataframe, each column election column for wisconsin
        # has a monthname and year
        valid_elections = []
        for column in wi_columns:
            if parse_histcols(column):
                valid_elections.append(column)

        cat_columns.extend(valid_elections)

        # Specify categorical columns to save memory
        dtype_dict = {
            col: ("str" if col not in cat_columns else "category")
            for col in wi_columns
        }

        # Wisconsin comes in two slightly different formats
        main_df = self.read_csv_count_error_lines(
            main_file["obj"],
            sep=file_sep,
            encoding=encoding_result,
            dtype=dtype_dict,
            error_bad_lines=False,
        )
        # Free the raw file buffers before the heavy transforms below.
        del self.main_file, self.temp_files, new_files
        gc.collect()

        # convert "Voter Status" to "voter_status" for backward compatibility
        main_df.rename(columns={"Voter Status": self.config["voter_status"]},
                       inplace=True)
        # drop rows with nan values for voterid and county
        main_df.dropna(subset=["Voter Reg Number", "County"], inplace=True)

        # remove the non digit voterid's to account for corrupted data (ie dates or names that wound up in the voter
        # id column
        main_df = main_df[main_df["Voter Reg Number"].astype(
            str).str.isdigit()]

        # standardize LaCrosse County and rename it to La Crosse County
        main_df.loc[main_df["County"].str.lower() == "lacrosse county",
                    "County"] = "La Crosse County"
        gc.collect()
        # dummy columns for party and birthday
        main_df[self.config["party_identifier"]] = np.nan
        main_df[self.config["birthday_identifier"]] = np.datetime64("NaT")

        logging.info("dataframe memory usage: {}".format(
            main_df.memory_usage(deep=True).sum() // 1024**3))

        self.column_check(list(set(main_df.columns) - set(valid_elections)))
        # sort from oldest election available to newest
        valid_elections = sorted(valid_elections,
                                 key=lambda date: parser.parse(date))

        # election_counts: a pd series of the valid elections and the vote counts per election
        election_counts = main_df[valid_elections].count()
        # returns the decreasing counts of people who voted per election

        # election_counts.index[i] contains the election "name"
        # k contains the count of people who voted in that elections
        sorted_codes_dict = {
            election_counts.index[i]: {
                "index":
                i,
                "count":
                k,
                "date":
                str(
                    datetime.strptime(election_counts.index[i],
                                      "%B%Y").date().strftime("%m/%d/%Y")),
            }
            for i, k in enumerate(election_counts)
        }

        sorted_codes = list(election_counts.index)

        def insert_codes_bin(row):
            # For one voter's row of election cells, build the three
            # parallel history arrays from the non-null entries.
            sparse_hist = []
            votetype_hist = []
            all_hist = []
            for i, k in row.iteritems():
                if pd.notnull(k):
                    sparse_hist.append(sorted_codes_dict[i]["index"])
                    type_hist = k.replace(" ", "")
                    votetype_hist.append(type_hist)
                    all_hist.append(i)
            return sparse_hist, votetype_hist, all_hist

        main_df[["sparse_history", "votetype_history", "all_history"
                 ]] = main_df[valid_elections].apply(insert_codes_bin,
                                                     axis=1,
                                                     result_type="expand")
        # The raw per-election columns are no longer needed.
        main_df.drop(columns=valid_elections, inplace=True)
        gc.collect()

        main_df = self.config.coerce_numeric(
            main_df, extra_cols=["HouseNumber", "ZipCode", "UnitNumber"])
        main_df = self.config.coerce_dates(main_df)
        main_df = self.config.coerce_strings(main_df)

        self.meta = {
            "message": "wisconsin_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        logging.info("Wisconsin: writing out")
        # Serialize first, then free the frame to keep peak memory down.
        df_csv = main_df.to_csv(encoding="utf-8", index=False)
        del main_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_csv),
            s3_bucket=self.s3_bucket,
        )
        del df_csv
        gc.collect()
Exemplo n.º 20
0
    def execute(self):
        """Preprocess the Washington state voter file.

        Unzips the raw download, parses the voter file and any history
        files (auto-detecting the field separator), builds the election
        encoding metadata, attaches per-voter history columns, coerces
        column types, and stores the result in ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        # Skip any PDF documentation bundled with the export.
        new_files = [
            n for n in self.unpack_files(self.main_file, compression="unzip")
            if ("pdf" not in n["name"].lower())
        ]

        # there should be only one voter file
        voter_file = [n for n in new_files if "vrdb" in n["name"].lower()][0]
        hist_files = [n for n in new_files if "history" in n["name"].lower()]

        if not self.ignore_checks:
            # We're already automatically limiting voter_file to one entry
            self.file_check(len([voter_file]), hist_files=len(hist_files))

        # There are two possible separators. Detect it first.
        line = voter_file["obj"].readline().decode()
        delimiter = detect(line)
        # Return to the beginning of the buffer to read the data now that we
        # know what the separator is.
        voter_file["obj"].seek(0)
        df_voter = pd.read_csv(voter_file["obj"],
                               sep=delimiter,
                               encoding="latin-1",
                               dtype=str,
                               error_bad_lines=False)

        # Concatenate every history file, detecting each file's separator
        # independently.
        df_hist = pd.DataFrame()
        for hist_file in hist_files:
            line = hist_file["obj"].readline().decode()
            delimiter = detect(line)
            hist_file["obj"].seek(0)
            temp = pd.read_csv(hist_file["obj"],
                               sep=delimiter,
                               encoding="latin-1",
                               dtype=str)
            df_hist = df_hist.append(temp, ignore_index=True)

        # --- handling the voter history file --- #

        # Need to fix/combine the differently named VoterHistoryID
        # and VotingHistoryID columns
        if {"VotingHistoryID", "VoterHistoryID"}.issubset(df_hist.columns):
            df_hist["VotingHistoryID"] = (df_hist.pop("VoterHistoryID").fillna(
                df_hist.pop("VotingHistoryID")))

        # can't find voter history documentation in any yaml, hardcoding column name
        election_dates = pd.to_datetime(df_hist.loc[:, "ElectionDate"],
                                        errors="coerce").dt

        # Unique election dates together with their participation counts.
        elections, counts = np.unique(election_dates.date, return_counts=True)

        def convert_date(k):
            # NaT (from unparseable dates coerced above) raises ValueError
            # on strftime; report those as "unknown".
            try:
                return k.strftime("%m/%d/%Y")
            except ValueError:
                return "unknown"

        # Map each election date to its array index, count, and a
        # display-formatted date; stored in self.meta below.
        sorted_elections_dict = {
            str(k): {
                "index": i,
                "count": int(counts[i]),
                "date": convert_date(k),
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        # all_history holds election-date strings; sparse_history holds
        # their integer indices into sorted_elections.
        df_hist.loc[:, "all_history"] = election_dates.date.apply(str)
        df_hist.loc[:, "sparse_history"] = df_hist.loc[:, "all_history"].map(
            lambda x: int(sorted_elections_dict[x]["index"]))
        df_hist.loc[:, "county_history"] = df_hist.loc[:, self.config[
            "primary_locale_identifier"]]

        # Collapse per-vote rows into per-voter lists keyed by voter id.
        voter_groups = df_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["all_history"].apply(list)
        sparse_history = voter_groups["sparse_history"].apply(list)
        county_history = voter_groups["county_history"].apply(list)
        df_hist = pd.concat([all_history, sparse_history, county_history],
                            axis=1)

        # --- handling the voter file --- #
        # some columns have become obsolete
        df_voter = df_voter.loc[:,
                                df_voter.columns.isin(self.
                                                      config["column_names"])]
        df_voter = df_voter.set_index(self.config["voter_id"])

        # pandas loads any numeric column with NaN values as floats
        # causing formatting trouble during execute() with a few columns
        # saw this solution in other states (arizona & texas)
        to_numeric = [
            df_voter.loc[:, col].str.isnumeric().all()
            for col in df_voter.columns
        ]
        df_voter.loc[:, to_numeric] = (
            df_voter.loc[:, to_numeric].fillna(-1).astype(int))

        df_voter = self.config.coerce_numeric(df_voter)
        df_voter = self.config.coerce_strings(
            df_voter,
            exclude=[
                self.config["primary_locale_identifier"],
                self.config["voter_id"],
            ],
        )
        df_voter = self.config.coerce_dates(df_voter)

        # add voter history
        df_voter = df_voter.join(df_hist)

        # Add party_idenitfier dummy values,
        # since WA doesn't have party info
        df_voter.loc[:, self.config["party_identifier"]] = NO_PARTY_PLACEHOLDER

        # Need to remap status codes because the original data are messy
        df_voter["StatusCodeOrig"] = df_voter["StatusCode"]
        df_voter["StatusCode"] = df_voter["StatusCodeOrig"].map(
            self.config["status_codes_remap"])
        if df_voter["StatusCode"].isnull().any():
            # Surface (but tolerate) codes absent from the remap table.
            missing = df_voter[
                df_voter["StatusCode"].isnull()]["StatusCodeOrig"].to_list()
            logging.warning("Status codes missing from status_codes_remap")
            logging.warning(missing)

        # Check for missing columns; catch error because we're fixing them
        # below
        try:
            self.column_check(list(df_voter.columns))
        except MissingColumnsError:
            pass

        # Make sure all columns are present
        expected_cols = (self.config["ordered_columns"] +
                         self.config["ordered_generated_columns"])
        # Remove the index column to avoid duplication
        expected_cols.remove(self.config["voter_id"])

        df_voter = self.reconcile_columns(df_voter, expected_cols)
        df_voter = df_voter[expected_cols]

        self.meta = {
            "message": f"washington_{datetime.now().isoformat()}",
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 21
0
    def execute(self):
        """Preprocess the Texas voter/history fixed-width export.

        Splits the unpacked fixed-width files into a voter frame and a
        vote-history frame (classified by how sparse Election_Date is),
        builds the election encoding/decoding metadata, attaches per-voter
        history lists, coerces column types, and stores the result in
        ``self.processed_file``.

        Raises:
            ValueError: if a file's line length matches neither known
                fixed-width layout.
            MissingNumColumnsError: if a parsed file does not have the
                expected number of columns.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file, compression="unzip"
        )

        if not self.ignore_checks:
            self.file_check(len(new_files))

        # Two known fixed-width layouts, distinguished by physical line
        # length (686 vs 680 bytes); they differ only in the second field.
        widths_one = [3, 10, 10, 50, 50, 50, 50, 4, 1, 8, 9, 12, 2, 50, 12,
                      2, 12, 12, 50, 9, 110, 50, 50, 20, 20, 8, 1, 1, 8, 2,
                      3, 6]
        widths_two = [3, 4, 10, 50, 50, 50, 50, 4, 1, 8, 9, 12, 2, 50, 12,
                      2, 12, 12, 50, 9, 110, 50, 50, 20, 20, 8, 1, 1, 8, 2,
                      3, 6]

        df_voter = pd.DataFrame(columns=self.config.raw_file_columns())
        df_hist = pd.DataFrame(columns=self.config.raw_file_columns())
        have_length = False
        for i in new_files:
            # seek(0, SEEK_END) returns the actual file length; the
            # previous one-argument seek(SEEK_END) merely moved to offset
            # 2 (SEEK_END == 2), so empty files were never detected.
            file_len = i["obj"].seek(0, SEEK_END)
            i["obj"].seek(0, SEEK_SET)
            if "count" not in i["name"] and file_len != 0:

                if not have_length:
                    # Measure the first physical line to pick a layout.
                    line_length = len(i["obj"].readline())
                    # Rewind so read_fwf starts at the first record; the
                    # previous one-argument seek(SEEK_END) left the cursor
                    # at offset 2, garbling the first row of this file.
                    i["obj"].seek(0, SEEK_SET)
                    have_length = True
                    if line_length == 686:
                        widths = widths_one
                    elif line_length == 680:
                        widths = widths_two
                    else:
                        raise ValueError(
                            "Width possibilities have changed,"
                            "new width found: {}".format(line_length)
                        )
                logging.info("Loading file {}".format(i))
                new_df = pd.read_fwf(i["obj"], widths=widths, header=None)
                try:
                    new_df.columns = self.config.raw_file_columns()
                except ValueError:
                    logging.info("Incorrect number of columns found for texas")
                    raise MissingNumColumnsError(
                        "{} state is missing columns".format(self.state),
                        self.state,
                        len(self.config.raw_file_columns()),
                        len(new_df.columns),
                    )
                # History files carry Election_Date on (nearly) every row;
                # voter files leave it mostly blank.
                if new_df["Election_Date"].head(n=100).isnull().sum() > 75:
                    df_voter = pd.concat(
                        [df_voter, new_df], axis=0, ignore_index=True
                    )
                else:
                    df_hist = pd.concat(
                        [df_hist, new_df], axis=0, ignore_index=True
                    )
            # Release the underlying buffer as soon as it is consumed.
            del i["obj"]
        if df_hist.empty:
            logging.info("This file contains no voter history")
        # Round-trip through int to strip float artifacts (e.g. "20200101.0")
        # while preserving missing values as NaN.
        df_voter["Effective_Date_of_Registration"] = (
            df_voter["Effective_Date_of_Registration"]
            .fillna(-1)
            .astype(int, errors="ignore")
            .astype(str)
            .replace("-1", np.nan)
        )
        # Texas data carry no party affiliation; use a fixed placeholder.
        df_voter[self.config["party_identifier"]] = "npa"
        df_hist[self.config["hist_columns"]] = df_hist[
            self.config["hist_columns"]
        ].replace(np.nan, "", regex=True)
        # Election key: "<YYYYMMDD>_<type>_<party>".
        df_hist["election_name"] = (
            df_hist["Election_Date"].astype(str)
            + "_"
            + df_hist["Election_Type"].astype(str)
            + "_"
            + df_hist["Election_Party"].astype(str)
        )

        valid_elections, counts = np.unique(
            df_hist["election_name"], return_counts=True
        )

        def texas_datetime(x):
            # Election names are prefixed with a YYYYMMDD date; fall back
            # to the epoch for unparseable values.
            try:
                return datetime.strptime(x[0:8], "%Y%m%d")
            except ValueError:
                return datetime(1970, 1, 1)

        # Order elections newest-first for the encoding arrays.
        date_order = [
            idx
            for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: texas_datetime(x[1]),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": str(texas_datetime(k).date()),
            }
            for i, k in enumerate(sorted_codes)
        }

        df_hist["array_position"] = df_hist["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"])
        )
        logging.info("Texas: history apply")
        voter_groups = df_hist.groupby(self.config["voter_id"])
        sparse_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["Election_Voting_Method"].apply(list)

        # Align the per-voter history lists onto the voter frame by id.
        df_voter = df_voter.set_index(self.config["voter_id"])
        df_voter["sparse_history"] = sparse_history
        df_voter["all_history"] = voter_groups["election_name"].apply(list)
        df_voter["vote_type"] = vote_type
        gc.collect()
        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_dates(df_voter)
        df_voter = self.config.coerce_numeric(
            df_voter,
            extra_cols=[
                "Permanent_Zipcode",
                "Permanent_House_Number",
                "Mailing_Zipcode",
            ],
        )
        # Raw history columns are redundant once encoded into the arrays.
        df_voter.drop(self.config["hist_columns"], axis=1, inplace=True)
        self.meta = {
            "message": "texas_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        gc.collect()
        logging.info("Texas: writing out")
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 22
0
    def execute(self):
        """Preprocess the New Hampshire voter and history spreadsheets.

        Locates the history file and the voter file among the unpacked
        archive members (".mdb" files are ignored), builds the election
        encoding metadata from the combined election name/date, attaches
        per-voter history lists, and stores the result in
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="unzip")
        if not self.ignore_checks:
            self.file_check(len(new_files))

        # NOTE(review): assumes the archive always yields both a history
        # file and a voter file; if either is missing, hist_df/voters_df
        # below are referenced before assignment -- confirm upstream
        # guarantees.
        for f in new_files:
            # ignore ".mdb" files
            if (".xlsx" in f["name"]) or (".csv" in f["name"]):

                if "history" in f["name"].lower():
                    logging.info("Found history file: {}".format(f["name"]))
                    if ".xlsx" in f["name"]:
                        hist_df = pd.read_excel(f["obj"])
                    else:
                        hist_df = self.read_csv_count_error_lines(
                            f["obj"], error_bad_lines=False)
                    hist_df.drop_duplicates(inplace=True)

                elif ("checklist" in f["name"].lower()) or (
                        "voters"
                        in f["name"].lower()) or ("voter file"
                                                  in f["name"].lower()):
                    logging.info("Found voter file: {}".format(f["name"]))
                    if ".xlsx" in f["name"]:
                        voters_df = pd.read_excel(f["obj"])
                    else:
                        voters_df = self.read_csv_count_error_lines(
                            f["obj"], error_bad_lines=False)

        # add dummy columns for birthday and voter_status
        voters_df[self.config["birthday_identifier"]] = 0
        voters_df[self.config["voter_status"]] = np.nan

        self.column_check(list(voters_df.columns))
        voters_df = self.config.coerce_strings(voters_df)
        voters_df = self.config.coerce_numeric(
            voters_df, extra_cols=["ad_str3", "mail_str3"])

        # collect histories
        # Election key: "<name_lowercased_with_underscores>_<MM/DD/YYYY>".
        hist_df["combined_name"] = (
            hist_df["election_name"].str.replace(" ", "_").str.lower() + "_" +
            hist_df["election_date"])

        # Order elections chronologically by the trailing date component.
        sorted_codes = hist_df["combined_name"].unique().tolist()
        sorted_codes.sort(
            key=lambda x: datetime.strptime(x.split("_")[-1], "%m/%d/%Y"))
        counts = hist_df["combined_name"].value_counts()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts.loc[k]),
                "date": k.split("_")[-1],
            }
            for i, k in enumerate(sorted_codes)
        }

        def insert_code_bin(arr):
            # Translate a list of election keys into their array indices;
            # voters with no history (NaN from the groupby join) pass
            # through as NaN.
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        # Align per-voter history lists onto the voter frame by voter id.
        voters_df = voters_df.set_index("id_voter", drop=False)
        voter_id_groups = hist_df.groupby("id_voter")
        voters_df["all_history"] = voter_id_groups["combined_name"].apply(list)
        voters_df["sparse_history"] = voters_df["all_history"].map(
            insert_code_bin)
        voters_df["election_type_history"] = voter_id_groups[
            "election_type"].apply(list)
        voters_df["election_category_history"] = voter_id_groups[
            "election_category"].apply(list)
        voters_df["votetype_history"] = voter_id_groups["ballot_type"].apply(
            list)
        voters_df["party_history"] = voter_id_groups["cd_part_voted"].apply(
            list)
        voters_df["town_history"] = voter_id_groups["town"].apply(list)

        self.meta = {
            "message": "new_hampshire_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voters_df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 23
0
    def execute(self):
        """Preprocess the Michigan voter, history, and election-code files.

        Supports both the legacy fixed-width ".lst" exports and the newer
        ".csv" exports. Election names come from ELECTION_DATE when the
        history file provides it; otherwise they are looked up via an
        election-codes file in the archive or the nearest snapshot's
        metadata. Attaches per-voter history columns and stores the result
        in ``self.processed_file``.

        Raises:
            NotImplementedError: for unrecognized file extensions.
            MissingElectionCodesError: when no election-code source exists.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        # config = Config('michigan')
        new_files = self.unpack_files(file_obj=self.main_file)
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))
        # Match legacy or current naming; the "+ [None])[0]" idiom yields
        # None when no file matches.
        voter_file = ([
            n for n in new_files if "entire_state_v" in n["name"]
            or "EntireStateVoters" in n["name"]
        ] + [None])[0]
        hist_file = ([
            n for n in new_files if "entire_state_h" in n["name"]
            or "EntireStateVoterHistory" in n["name"]
        ] + [None])[0]
        elec_codes = ([n for n in new_files if "electionscd" in n["name"]] +
                      [None])[0]

        logging.info("Loading voter file: " + voter_file["name"])
        if voter_file["name"][-3:] == "lst":
            # Fixed-width column boundaries for the legacy ".lst" layout.
            vcolspecs = [
                [0, 35],
                [35, 55],
                [55, 75],
                [75, 78],
                [78, 82],
                [82, 83],
                [83, 91],
                [91, 92],
                [92, 99],
                [99, 103],
                [103, 105],
                [105, 135],
                [135, 141],
                [141, 143],
                [143, 156],
                [156, 191],
                [191, 193],
                [193, 198],
                [198, 248],
                [248, 298],
                [298, 348],
                [348, 398],
                [398, 448],
                [448, 461],
                [461, 463],
                [463, 468],
                [468, 474],
                [474, 479],
                [479, 484],
                [484, 489],
                [489, 494],
                [494, 499],
                [499, 504],
                [504, 510],
                [510, 516],
                [516, 517],
                [517, 519],
            ]
            vdf = pd.read_fwf(
                voter_file["obj"],
                colspecs=vcolspecs,
                names=self.config["fwf_voter_columns"],
                na_filter=False,
            )
        elif voter_file["name"][-3:] == "csv":
            vdf = self.read_csv_count_error_lines(
                voter_file["obj"],
                encoding="latin-1",
                na_filter=False,
                error_bad_lines=False,
            )
            # rename 'STATE' field to not conflict with our 'state' field
            vdf.rename(columns={"STATE": "STATE_ADDR"}, inplace=True)
        else:
            raise NotImplementedError("File format not implemented")
        del voter_file
        gc.collect()

        def column_is_empty(col):
            # NOTE(review): the second test is `sum(col == "")` used as a
            # boolean, so a column with ANY empty string is treated as
            # empty -- not just an all-empty column. Confirm this is the
            # intended behavior before changing it.
            total_size = col.shape[0]
            if (sum(col.isna()) == total_size) or (sum(col == "")):
                return True
            return False

        def fill_empty_columns(df):
            # Dummy values for newly added data fields
            if column_is_empty(df["STATUS_USER_CODE"]):
                df["STATUS_USER_CODE"] = "_"
            if column_is_empty(df["VOTER_ID"]):
                df["VOTER_ID"] = 0
            if column_is_empty(df["STATUS_DATE"]):
                df["STATUS_DATE"] = "1970-01-01 00:00:00"
            return df

        vdf = self.reconcile_columns(vdf, self.config["columns"])
        vdf = fill_empty_columns(vdf)
        vdf = vdf.reindex(columns=self.config["ordered_columns"])
        # Michigan data carry no party affiliation; use a placeholder.
        vdf[self.config["party_identifier"]] = "npa"

        logging.info("Loading history file: " + hist_file["name"])
        if hist_file["name"][-3:] == "lst":
            # Fixed-width boundaries for the legacy history layout.
            hcolspecs = [
                [0, 13],
                [13, 15],
                [15, 20],
                [20, 25],
                [25, 38],
                [38, 39],
            ]
            hdf = pd.read_fwf(
                hist_file["obj"],
                colspecs=hcolspecs,
                names=self.config["fwf_hist_columns"],
                na_filter=False,
            )
        elif hist_file["name"][-3:] == "csv":
            hdf = self.read_csv_count_error_lines(hist_file["obj"],
                                                  na_filter=False,
                                                  error_bad_lines=False)
            # Newer exports renamed the absentee flag; normalize it.
            if ("IS_ABSENTEE_VOTER"
                    not in hdf.columns) and ("IS_PERMANENT_ABSENTEE_VOTER"
                                             in hdf.columns):
                hdf.rename(
                    columns={
                        "IS_PERMANENT_ABSENTEE_VOTER": "IS_ABSENTEE_VOTER"
                    },
                    inplace=True,
                )
        else:
            raise NotImplementedError("File format not implemented")
        del hist_file
        gc.collect()

        # If hdf has ELECTION_DATE (new style) instead of ELECTION_CODE,
        # then we don't need to do election code lookups
        elec_code_dict = dict()
        missing_history_dates = False
        if "ELECTION_DATE" in hdf.columns:
            try:
                hdf["ELECTION_NAME"] = pd.to_datetime(
                    hdf["ELECTION_DATE"]).map(lambda x: x.strftime("%Y-%m-%d"))
            except ValueError:
                # Unparseable dates: keep the raw strings and flag so the
                # history arrays can be blanked out later.
                missing_history_dates = True
                hdf["ELECTION_NAME"] = hdf["ELECTION_DATE"]
        else:
            if elec_codes:
                # If we have election codes in this file
                logging.info("Loading election codes file: " +
                             elec_codes["name"])
                if elec_codes["name"][-3:] == "lst":
                    ecolspecs = [[0, 13], [13, 21], [21, 46]]
                    edf = pd.read_fwf(
                        elec_codes["obj"],
                        colspecs=ecolspecs,
                        names=self.config["elec_code_columns"],
                        na_filter=False,
                    )
                    edf["Date"] = pd.to_datetime(edf["Date"], format="%m%d%Y")
                elif elec_codes["name"][-3:] == "csv":
                    # I'm not sure if this would actually ever happen
                    edf = self.read_csv_count_error_lines(
                        elec_codes["obj"],
                        names=self.config["elec_code_columns"],
                        na_filter=False,
                        error_bad_lines=False,
                    )
                else:
                    raise NotImplementedError("File format not implemented")

                # make a code dictionary that will be stored with meta data
                for idx, row in edf.iterrows():
                    d = row["Date"].strftime("%Y-%m-%d")
                    elec_code_dict[row["Election_Code"]] = {
                        "Date":
                        d,
                        "Slug":
                        d + "_" + str(row["Election_Code"]) + "_" +
                        row["Title"].replace(" ", "-").replace("_", "-"),
                    }
            else:
                # Get election codes from most recent meta data
                this_date = parser.parse(date_from_str(
                    self.raw_s3_file)).date()
                pre_date, post_date, pre_key, post_key = get_surrounding_dates(
                    this_date,
                    self.state,
                    self.s3_bucket,
                    testing=self.testing)
                if pre_key is not None:
                    nearest_meta = get_metadata_for_key(
                        pre_key, self.s3_bucket)
                    elec_code_dict = nearest_meta["elec_code_dict"]
                    if len(elec_code_dict) == 0:
                        raise MissingElectionCodesError(
                            "No election codes in nearby meta data.")
                else:
                    raise MissingElectionCodesError(
                        "No election code file or nearby meta data found.")

            # Election code lookup
            hdf["ELECTION_NAME"] = hdf["ELECTION_CODE"].map(
                lambda x: elec_code_dict[str(x)]["Slug"]
                if str(x) in elec_code_dict else str(x))

        # Create meta data
        counts = hdf["ELECTION_NAME"].value_counts()
        counts.sort_index(inplace=True)
        sorted_codes = counts.index.to_list()
        # NOTE(review): counts[i] indexes a label-indexed Series with an
        # integer position (legacy positional fallback, removed in newer
        # pandas); counts.iloc[i] would be the explicit form -- confirm
        # before upgrading pandas.
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        # Collect histories
        vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
        hdf_id_groups = hdf.groupby(self.config["voter_id"])
        vdf["all_history"] = hdf_id_groups["ELECTION_NAME"].apply(list)
        vdf["votetype_history"] = hdf_id_groups["IS_ABSENTEE_VOTER"].apply(
            list)
        vdf["county_history"] = hdf_id_groups["COUNTY_CODE"].apply(list)
        vdf["jurisdiction_history"] = hdf_id_groups["JURISDICTION_CODE"].apply(
            list)
        vdf["schooldistrict_history"] = hdf_id_groups[
            "SCHOOL_DISTRICT_CODE"].apply(list)
        del hdf, hdf_id_groups
        gc.collect()

        def insert_code_bin(arr):
            # Map election names to their encoding indices; voters with no
            # history (NaN from the groupby join) pass through as NaN.
            if isinstance(arr, list):
                return [
                    sorted_codes_dict[k]["index"] for k in arr
                    if k in sorted_codes_dict
                ]
            else:
                return np.nan

        vdf["sparse_history"] = vdf["all_history"].map(insert_code_bin)

        # Without reliable dates the history arrays are meaningless.
        if missing_history_dates:
            vdf["all_history"] = None
            vdf["sparse_history"] = None

        vdf = self.config.coerce_dates(vdf)
        vdf = self.config.coerce_numeric(
            vdf,
            extra_cols=[
                "PRECINCT",
                "WARD",
                "VILLAGE_PRECINCT",
                "SCHOOL_PRECINCT",
            ],
        )
        vdf = self.config.coerce_strings(vdf)

        self.meta = {
            "message": "michigan_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
            "elec_code_dict": elec_code_dict,
        }

        # Serialize then free the frame to keep peak memory down.
        csv_obj = vdf.to_csv(encoding="utf-8", index=False)
        del vdf
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 24
0
    def execute(self):
        def district_fun(df_dist, df_voter, dist_dict):
            for dist_code in dist_dict.keys():
                temp_df = df_dist[df_dist["DistrictTypeCode"] == dist_code]
                temp_df = temp_df.rename(
                    columns={"DistrictName": dist_dict[dist_code]})
                df_voter = pd.merge(
                    df_voter,
                    temp_df[["PrecinctId", dist_dict[dist_code]]],
                    how="left",
                    on="PrecinctId",
                )
            df_voter.drop(columns=["PrecinctId"], inplace=True)
            return df_voter

        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        config = Config(file_name=self.config_file)
        new_files = self.unpack_files(file_obj=self.main_file)

        del self.main_file, self.temp_files
        gc.collect()

        # Have to use longer whole string not just suffix because hist will
        # match to voter file
        voter_file = [f for f in new_files if "pvrdr-vrd" in f["name"]][0]
        district_file = [f for f in new_files if "pvrdr-pd" in f["name"]][0]
        history_file = [f for f in new_files if "pvrdr-vph" in f["name"]][0]

        temp_voter_id_df = pd.read_csv(
            voter_file["obj"],
            sep="\t",
            encoding="latin-1",
            usecols=["RegistrantID"],
            dtype=str,
        )
        # rewind
        voter_file["obj"].seek(0)

        voter_ids = temp_voter_id_df["RegistrantID"].unique().tolist()

        del temp_voter_id_df
        gc.collect()

        hist_dict = {i: np.nan for i in voter_ids}
        votetype_dict = {i: np.nan for i in voter_ids}
        del voter_ids
        gc.collect()

        # key election, values date and count, then sort.
        # gonna have to iterate over all_hist and map to sparse
        elect_dict = defaultdict(int)

        def dict_cols(chunk,
                      history_dict=None,
                      votetype_dict=None,
                      election_dict=None):
            chunk["combined_col"] = (
                chunk["ElectionType"].replace(" ", "") + "_" +
                chunk["ElectionDate"]
                # + "_"
                # + chunk["Method"]
            )
            chunk["election"] = (chunk["ElectionType"].replace(" ", "") + "_" +
                                 chunk["ElectionDate"])
            chunk.drop(
                columns=[
                    "ElectionType",
                    "ElectionName",
                    "ElectionDate",
                    "CountyCode",
                ],
                inplace=True,
            )
            for row in chunk.itertuples():
                try:
                    current_li = hist_dict[row.RegistrantID]
                    votetype_hist = votetype_dict[row.RegistrantID]
                    # throws key error for entries not in voter file
                    election_dict[row.election] += 1
                    combined_row = row.combined_col
                    if isinstance(current_li, list):
                        current_li.append(combined_row)
                        votetype_hist.append(row.Method)
                        history_dict[row.RegistrantID] = current_li
                        votetype_dict[row.RegistrantID] = votetype_hist
                    else:
                        # test_dict[row['RegistrantID']][0]
                        history_dict[row.RegistrantID] = [
                            combined_row
                        ]  # Create list of elections even if len 1
                        votetype_dict[row.RegistrantID] = [row.Method]
                except KeyError:
                    continue

        # Chunk size, over ~3 mil of so leads to slowdown
        chunk_size = 3000000

        history_chunks = pd.read_csv(
            history_file["obj"],
            sep="\t",
            usecols=[
                "RegistrantID",
                "CountyCode",
                "ElectionDate",
                "ElectionName",
                "ElectionType",
                "Method",
            ],
            dtype=str,
            chunksize=chunk_size,
        )
        for chunk in history_chunks:
            dict_cols(chunk, hist_dict, votetype_dict, elect_dict)

        history_file["obj"].close()
        del history_file
        gc.collect()

        hist_series = pd.Series(hist_dict, name="all_history")

        del hist_dict
        gc.collect()

        votetype_series = pd.Series(votetype_dict, name="votetype_history")

        del votetype_dict
        gc.collect()

        logging.info("reading in CA voter df")

        category_list = [
            "CountyCode",
            "Suffix",
            "StreetDirPrefix",
            "AddressNumberSuffix",
            "StreetType",
            "StreetDirSuffix",
            "UnitType",
            "City",
            "State",
            "Zip",
            "Language",
            "Gender",
            "PartyCode",
            "Status",
            "VoterStatusReasonCodeDesc",
            "AssistanceRequestFlag",
            "VbmVoterType",
            "USCongressionalDistrict",
            "StateSenate",
            "Municipality",
            "StateAddr",
        ]
        # read in columns to set dtype as pyarrow
        col_ifornia = pd.read_csv(voter_file["obj"],
                                  sep="\t",
                                  nrows=0,
                                  encoding="latin-1").columns.tolist()

        voter_file["obj"].seek(0)
        dtype_dict = {
            col:
            ("string[pyarrow]" if col not in category_list else "category")
            for col in col_ifornia
        }
        voter_df = pd.read_csv(
            voter_file["obj"],
            sep="\t",
            dtype=dtype_dict,
            encoding="latin-1",
            on_bad_lines="warn",
        )

        # Replaces the state column name in the address fields with StateAddr to avoid duplicate column names
        voter_df.rename(columns={"State": "StateAddr"}, inplace=True)

        logging.info("dataframe memory usage: {}".format(
            round((voter_df.memory_usage(deep=True).sum() / 1024**2), 2)))

        voter_file["obj"].close()
        del voter_file
        gc.collect()

        district_dict = {
            "CG": "USCongressionalDistrict",
            "SS": "StateSenate",
            "SA": "StateAssembly",
            "CI": "Municipality",
            "SU": "CountySupervisoral",
        }
        district_df = pd.read_csv(district_file["obj"],
                                  sep="\t",
                                  dtype="string[pyarrow]")

        district_file["obj"].close()
        del district_file

        merged_districts = district_fun(
            district_df,
            voter_df[["RegistrantID", "PrecinctId"]],
            district_dict,
        )

        voter_df = voter_df.merge(merged_districts,
                                  left_on="RegistrantID",
                                  right_on="RegistrantID")

        del merged_districts
        gc.collect()

        voter_df.set_index("RegistrantID", inplace=True)

        voter_df = voter_df.merge(hist_series,
                                  left_index=True,
                                  right_index=True)

        del hist_series
        gc.collect()

        voter_df = voter_df.merge(votetype_series,
                                  left_index=True,
                                  right_index=True)

        del votetype_series
        gc.collect()

        # create sparse history
        sorted_keys = sorted(elect_dict.items(),
                             key=lambda x: x[0].split("_")[1])
        sorted_codes_dict = {
            value[0]: {
                "index": i,
                "count": value[1]
            }
            for i, value in enumerate(sorted_keys)
        }
        sorted_codes = [x[0] for x in sorted_keys]
        voter_df["sparse_history"] = voter_df.all_history.apply(
            lambda x: [sorted_codes_dict[y]["index"] for y in x]
            if x == x else np.nan)
        # Begin Coerce

        # categories to turn them in to strings
        logging.info("coecrcing strings")
        voter_df = self.coerce_strings(voter_df, config, category_list)

        logging.info("coecrcing dates")
        voter_df = self.config.coerce_dates(voter_df)

        logging.info("coecrcing numeric")
        voter_df = self.config.coerce_numeric(voter_df)

        voter_df = voter_df.reset_index().rename(
            columns={"index": "RegistrantID"})

        voter_csv = voter_df.to_csv(encoding="utf-8", index=False)

        del voter_df
        gc.collect()

        self.meta = {
            "message": "california_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_csv),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 25
0
    def execute(self):
        """Process the Montana voter and vote-history files.

        Downloads the raw zip from S3 if needed, builds per-voter history
        lists from the history file, joins them onto the voter file, and
        stores the result as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")

        # Identify the voter ("voter_ex...") and history ("voter_his...")
        # members of the unpacked archive by filename substring.
        voter_file = [n for n in new_files
                      if "voter_ex" in n["name"].lower()][0]
        hist_file = [n for n in new_files
                     if "voter_his" in n["name"].lower()][0]

        # --- handling voter history --- #

        df_hist = pd.read_csv(hist_file["obj"], dtype=str).rename(
            {"Voter ID": self.config["voter_id"]}, axis=1)

        # Invert the config mappings so the raw file's codes can be mapped
        # back to their human-readable names.
        election_codes = {
            str(v): k
            for k, v in self.config["election_codes"].items()
        }
        votetype_codes = {
            str(v): k
            for k, v in self.config["votetype_codes"].items()
        }

        # Normalize absentee ballot stage/status values; anything not in
        # the map (including NaN) is treated as a non-absentee vote.
        df_hist.loc[:, "BALLOTSTAGE/STATUS"] = (
            df_hist.loc[:, "BALLOTSTAGE/STATUS"].map({
                "Processed/Accepted":
                "absentee-ACCEPTED",
                "Sent":
                "absentee-SENT",
                "Processed/Rejected":
                "absentee-REJECTED",
                "Undeliverable":
                "absentee-UNDELIVERABLE",
            }).fillna("non-absentee"))

        # Keep only votes that counted: in-person ballots and accepted
        # absentee ballots.
        df_hist = df_hist.loc[df_hist["BALLOTSTAGE/STATUS"].
                              isin(["non-absentee", "absentee-ACCEPTED"]), :, ]

        df_hist.loc[:, "ELECTION_TYPE"] = df_hist.loc[:, "ELECTION_TYPE"].map(
            election_codes)

        # if the election code does not exist, take a clean version of the election description
        df_hist.loc[df_hist["ELECTION_TYPE"].isna(),
                    "ELECTION_DESCRIPTION"] = (
                        df_hist.loc[df_hist["ELECTION_TYPE"].isna(),
                                    "ELECTION_DESCRIPTION"].str.lower(
                                    ).str.split(" ").str.join("_"))
        # will use later
        election_dates = pd.to_datetime(df_hist.loc[:, "ELECTION_DATE"])
        df_hist.loc[:,
                    "ELECTION_DATE"] = election_dates.dt.strftime("%Y-%m-%d")

        # creating election ids (e.g. "2020_11_03_<type>")
        df_hist.loc[:,
                    "all_history"] = (election_dates.dt.strftime("%Y_%m_%d_") +
                                      df_hist.loc[:, "ELECTION_TYPE"])
        df_hist.loc[:, "votetype_history"] = df_hist.loc[:, "VVM_ID"].map(
            votetype_codes)
        df_hist.loc[:, "county_history"] = df_hist.loc[:, "JS_CODE"].fillna(0)

        # One row per (election id, date) with the number of voters.
        elections = (df_hist.groupby([
            "all_history", "ELECTION_DATE"
        ])[self.config["voter_id"]].count().reset_index().values)

        sorted_elections_dict = {
            k[0]: {
                "index": i,
                "count": int(k[2]),
                "date": str(k[1])
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        # sparse_history stores the integer index of each election instead
        # of its full id, keeping the per-voter arrays compact.
        df_hist.loc[:, "sparse_history"] = df_hist.loc[:, "all_history"].map(
            lambda x: sorted_elections_dict[x]["index"])

        df_hist = df_hist.loc[:, [
            self.config["voter_id"],
            "all_history",
            "votetype_history",
            "county_history",
            "sparse_history",
        ], ]

        # Collapse per-vote rows into one list-valued row per voter.
        df_group = df_hist.groupby(self.config["voter_id"])
        groups = []
        for col in df_hist.columns[1:]:
            group = df_group[col].apply(list)
            groups.append(group)

        df_hist = pd.concat(groups, axis=1)

        # --- handling voter file --- #

        df_voter = pd.read_csv(voter_file["obj"], sep="\t", index_col=False)
        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_numeric(df_voter)
        df_voter = self.config.coerce_dates(df_voter)

        # Attach the aggregated history lists to each voter by voter id.
        df_voter = df_voter.set_index(self.config["voter_id"]).join(df_hist)

        self.meta = {
            "message": "montana_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 26
0
    def execute(self):
        """Process the Vermont statewide voter file.

        The file carries one boolean ("T"/"F") participation column per
        election; these are converted into per-voter ``all_history`` /
        ``sparse_history`` lists and the result is stored as
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def hist_map(df, columns):
            # Build a Series of per-row lists of the elections a voter
            # participated in, dropping the string "nan" placeholders that
            # astype(str) produces for missing values.
            def mapping(li):
                li = [x for x in li if x != "nan"]
                return li

            return pd.Series(
                map(mapping, df[columns].values.astype(str).tolist()))

        new_files = self.unpack_files(self.main_file, compression="unzip")
        self.file_check(len(new_files))
        # BUGFIX: the original condition was
        #   "voter file" or "Statewidevoters" in n["name"].lower()
        # which is always true ("voter file" is a truthy literal), so the
        # filter matched every file; and capitalized "Statewidevoters"
        # could never match a lowered name. Test each substring explicitly.
        voter_file = [
            n for n in new_files
            if "voter file" in n["name"].lower()
            or "statewidevoters" in n["name"].lower()
        ][0]
        vdf = pd.read_csv(voter_file["obj"], sep="|", dtype=str)
        # Drop pandas' auto-generated "Unnamed: N" columns from trailing
        # delimiters.
        unnamed_cols = vdf.columns[vdf.columns.str.contains("Unnamed")]
        vdf.drop(columns=unnamed_cols, inplace=True)
        election_columns = [
            col for col in vdf.columns if "election" in col.lower()
        ]
        # Vermont has no party registration; keep the column for schema
        # consistency.
        vdf[self.config["party_identifier"]] = np.nan

        cols_to_check = [x for x in vdf.columns if x not in election_columns]
        self.column_check(cols_to_check)

        # strip the word "participation" and replace spaces with underscores
        # for consistency
        rename_dict = {
            col: col.replace(" Participation", "").replace(" ", "_")
            for col in election_columns
        }

        vdf.rename(columns=rename_dict, inplace=True)

        election_columns = list(rename_dict.values())
        # Replacing the boolean values in the cells with the election name for
        # processing
        for c in list(rename_dict.values()):
            vdf.loc[:, c] = vdf.loc[:, c].map({
                "T": c.replace(" ", "_"),
                "F": np.nan
            })

        # election_counts is a pandas series containing the general elections
        # as an index how many people voted in each general election
        election_counts = vdf[election_columns].count().sort_index()

        # Iterates through the election series, and extracts the information
        # necessary to create metadata; the election year is the first four
        # characters of the column name.
        sorted_codes_dict = {
            election_counts.index[i]: {
                "index":
                i,
                "count":
                k,
                "date":
                str(
                    datetime.strptime(election_counts.index[i][:4],
                                      "%Y").date().strftime("%m/%d/%Y")),
            }
            for i, k in enumerate(election_counts)
        }
        sorted_elections = sorted(list(sorted_codes_dict.keys()))
        vdf["all_history"] = hist_map(vdf[election_columns], election_columns)

        def insert_code_bin(arr):
            # Map each election name in a voter's history to its compact
            # integer index; non-list entries (missing history) become NaN.
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        vdf.loc[:,
                "sparse_history"] = vdf.loc[:,
                                            "all_history"].map(insert_code_bin)

        vdf = vdf.set_index(self.config["voter_id"])

        vdf = self.config.coerce_strings(vdf)
        vdf = self.config.coerce_numeric(vdf)
        vdf = self.config.coerce_dates(vdf)

        self.meta = {
            "message": "vermont_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(vdf.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 27
0
    def execute(self):
        """Process the Arkansas voter (vr.csv) and history (vh.csv) files.

        Reshapes the wide per-election history columns into long format,
        aggregates them into per-voter lists, joins them onto the voter
        file, and stores the result as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file)

        voter_file = [n for n in new_files if "vr.csv" == n["name"].lower()][0]
        hist_file = [n for n in new_files if "vh.csv" == n["name"].lower()][0]

        # --- handling the vote history file --- #
        df_hist = pd.read_csv(hist_file["obj"], dtype=str)

        # Each election has a family of columns: how the voter voted, the
        # party ballot chosen, and the county voted in.
        elections = pd.Series(self.config["elections"])
        election_votetype = elections + "HowVoted"
        election_party = elections + "PartyVoted"
        election_county = elections + "CountyVotedIn"

        election_cols = zip(
            *[elections, election_votetype, election_party, election_county])

        # Build one long-format frame per election, then stack them.
        election_dfs = []
        for e in election_cols:
            election_df = df_hist.set_index(self.config["voter_id"])
            election_df = election_df.loc[:, election_df.columns.isin(e)]
            election_df = election_df.dropna(how="all")
            election_df.columns = [
                "all_history",
                "county_history",
                "party_history",
                "votetype_history",
            ]
            election_df.loc[:, "all_history"] = e[0]
            election_dfs.append(election_df.reset_index())

        df_hist = pd.concat(election_dfs, ignore_index=True)
        df_hist = df_hist.fillna("NP").applymap(lambda x: x.strip(" "))

        # Order elections by descending participation count.
        elections, counts = np.unique(df_hist.all_history, return_counts=True)
        order = np.argsort(counts)[::-1]
        counts = counts[order]
        elections = elections[order]
        # FIX: raw string so "\d" is a real regex escape rather than a
        # deprecated invalid string escape. Extracts the two-digit year from
        # the election name and expands it to "20YY".
        election_years = list(
            pd.to_datetime(("20" + pd.Series(elections).str.extract(
                r"(\d{2}(?!\d))", expand=False))).dt.year)

        sorted_elections_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": str(election_years[i]),
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        # Compact integer index per election for the sparse history arrays.
        df_hist.loc[:, "sparse_history"] = df_hist.all_history.map(
            lambda x: int(sorted_elections_dict[x]["index"]))

        # Collapse per-vote rows into one list-valued row per voter.
        group = df_hist.groupby(self.config["voter_id"])
        df_hist = pd.concat(
            [group[col].apply(list) for col in df_hist.columns[1:]], axis=1)

        # --- handling the voter file --- #
        df_voter = pd.read_csv(voter_file["obj"], dtype=str)

        df_voter = self.config.coerce_dates(df_voter)
        df_voter = self.config.coerce_numeric(df_voter)
        df_voter = self.config.coerce_strings(
            df_voter, exclude=[self.config["voter_id"]])

        df_voter = df_voter.set_index(self.config["voter_id"]).join(df_hist)

        self.meta = {
            "message": "arkansas_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 28
0
    def execute(self):
        """Process the Wyoming statewide voter file and per-election
        history files.

        Each history file covers one election; its rows are normalized
        onto a common column set, stacked, aggregated into per-voter
        lists, and joined onto the voter file. The result is stored as
        ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")

        voter_file = [
            n for n in new_files if "statewide" in n["name"].lower()
        ][0]
        hist_files = [n for n in new_files if "history" in n["name"].lower()]

        # --- handling voter history --- #

        election_col = self.config["election_columns"]
        elections = self.config["elections"]

        df_hist = []

        for file in hist_files:
            # Sniff the delimiter from the first line: some exports are
            # tab-separated, others comma-separated.
            text = file["obj"].readline()
            file["obj"].seek(0)

            if b"\t" in text:
                df = pd.read_csv(file["obj"], sep="\t", dtype=str)
            elif b"," in text:
                df = pd.read_csv(file["obj"], sep=",", dtype=str)
            else:
                # BUGFIX: previously `df` was left unbound here and the
                # loop body failed later with a NameError; fail fast with
                # a descriptive error instead.
                raise ValueError(
                    "unrecognized delimiter in history file: "
                    + file["name"])

            # The election name is everything before " Vot..." in the
            # file name (e.g. "2020 General Voters...").
            election_type = file["name"][:file["name"].find(" Vot")]

            if election_type not in elections:
                print(
                    "Warning:",
                    election_type,
                    "not in documentation. Some fields may be excluded.",
                )

            # Normalize each documented column onto its canonical name and
            # fill columns this election's file lacks with "NP".
            for var, names in election_col.items():
                for col in df.columns:
                    if col in names:
                        df = df.rename({col: var}, axis=1)
                if var not in df.columns:
                    df.loc[:, var] = "NP"

            df = df.loc[:, election_col.keys()]

            df.loc[:, "election_type"] = election_type

            df_hist.append(df)

        df_hist = (pd.concat(df_hist, ignore_index=True).dropna(
            how="any").applymap(lambda x: str(x).strip()))

        # Election id like "2020_general". Raw strings so "\d"/"\s" are
        # real regex escapes (were deprecated invalid string escapes), and
        # "[gp]" instead of "[g|p]" (the "|" was a stray literal pipe
        # inside the character class).
        df_hist.loc[:, "all_history"] = (
            df_hist.loc[:, "election_type"].str.lower().str.extract(
                r"(\d+\s+[gp]\w+)",
                expand=False).str.split(r"\s").str.join("_"))
        df_hist.loc[:, "election_date"] = pd.to_datetime(
            df_hist.loc[:, "election_date"].replace(
                "NP", pd.NaT)).dt.strftime("%m/%d/%Y")

        election_dates_dict = (
            df_hist.groupby("all_history")["election_date"].first().to_dict())
        elections, counts = np.unique(df_hist.loc[:, "all_history"],
                                      return_counts=True)

        sorted_elections_dict = {
            str(k): {
                "index": i,
                "count": int(counts[i]),
                "date": election_dates_dict[k],
            }
            for i, k in enumerate(elections)
        }
        sorted_elections = list(sorted_elections_dict.keys())

        # Compact integer index per election for the sparse history arrays.
        df_hist.loc[:, "sparse_history"] = df_hist.loc[:, "all_history"].map(
            lambda x: int(sorted_elections_dict[x]["index"]))

        voter_groups = df_hist.sort_values("election_type").groupby(
            self.config["voter_id"])

        # Collapse per-vote rows into one list-valued column per field.
        all_history = voter_groups["all_history"].apply(list)
        sparse_history = voter_groups["sparse_history"].apply(list)
        votetype_history = (
            voter_groups["vote_method"].apply(list).rename("votetype_history"))
        party_history = (voter_groups[self.config["party_identifier"]].apply(
            list).rename("party_history"))
        precinct_history = (
            voter_groups["precinct"].apply(list).rename("precinct_history"))

        df_hist = pd.concat(
            [
                all_history,
                sparse_history,
                votetype_history,
                party_history,
                precinct_history,
            ],
            axis=1,
        )

        # --- handling voter file --- #

        df_voter = pd.read_csv(voter_file["obj"], dtype=str)

        df_voter = self.config.coerce_strings(
            df_voter, exclude=[self.config["voter_id"]])
        df_voter = self.config.coerce_numeric(
            df_voter,
            extra_cols=[
                "Zip (RA)",
                "Split",
                "Precinct",
                "ZIP (MA)",
                "House",
                "Senate",
            ],
        )
        df_voter = self.config.coerce_dates(df_voter)

        # Voter ids are zero-padded to 9 characters so history and voter
        # records key identically.
        df_voter.loc[:, self.config["voter_id"]] = (
            df_voter.loc[:, self.config["voter_id"]].str.zfill(9).astype(str))
        df_voter = df_voter.set_index(self.config["voter_id"])

        df_voter = df_voter.join(df_hist)

        self.meta = {
            "message": "wyoming_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_elections_dict),
            "array_decoding": json.dumps(sorted_elections),
        }

        self.is_compressed = False

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8", index=True)),
            s3_bucket=self.s3_bucket,
        )
Exemplo n.º 29
0
    def execute(self):
        """Process the New Jersey (v2) per-county voter and history files.

        Combines the per-county "vlist" voter files and "ehist" history
        files, normalizes statuses and district fields, builds per-voter
        history lists, and stores the result as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def format_birthdays_differently_per_county(df):
            # Counties use different date formats; try each configured
            # format and keep the first one that parses a majority of rows.
            field = self.config["birthday_identifier"]
            df[field] = df[field].apply(str)
            for format_str in self.config["date_format"]:
                formatted = pd.to_datetime(df[field],
                                           format=format_str,
                                           errors="coerce")
                if len(formatted[~formatted.isna()]) > (0.5 * len(formatted)):
                    df[field] = formatted
                    break
            return df

        def combine_dfs(filelist):
            # Concatenate all per-county files into one frame.
            df = pd.DataFrame()
            for f in filelist:
                logging.info("Reading file: {}".format(f["name"]))
                new_df = self.read_csv_count_error_lines(f["obj"],
                                                         error_bad_lines=False)
                if "vlist" in f["name"]:
                    new_df = format_birthdays_differently_per_county(new_df)
                df = pd.concat([df, new_df], axis=0)
            return df

        def simplify_status(status):
            # Reduce NJ's many active/inactive sub-statuses to the three
            # basic values our data model supports.
            basic_status = ["Active", "Inactive", "Pending"]
            if type(status) is str:
                for s in basic_status:
                    if s in status:
                        return s
            return np.nan

        def insert_code_bin(arr):
            # Map each election name in a voter's history to its compact
            # integer index; non-list entries (missing history) become NaN.
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        def handle_date(d):
            possible_date = date_from_str(d)
            if possible_date is None:
                return ""
            return pd.to_datetime(possible_date).strftime("%m/%d/%Y")

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="infer")
        voter_files = [n for n in new_files if "vlist" in n["name"].lower()]
        hist_files = [n for n in new_files if "ehist" in n["name"].lower()]
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(voter_files), len(hist_files))
        voter_df = combine_dfs(voter_files)
        hist_df = combine_dfs(hist_files)
        del voter_files, hist_files, new_files
        gc.collect()

        voter_df = self.config.coerce_strings(voter_df)

        if "displayId" in voter_df.columns:
            voter_df.rename(columns={"displayId": self.config["voter_id"]},
                            inplace=True)
        voter_df[self.config["voter_id"]] = voter_df[
            self.config["voter_id"]].str.upper()
        # BUGFIX: strip literal dots from the party value (e.g. "DEM.").
        # Without regex=False, older pandas defaults to regex=True and "."
        # matches every character, erasing the whole string.
        voter_df[self.config["party_identifier"]] = voter_df[
            self.config["party_identifier"]].str.replace(".", "",
                                                         regex=False)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "apt_unit",
                "ward",
                "district",
                "congressional",
                "legislative",
                "freeholder",
                "school",
                "fire",
            ],
        )

        # ensure district fields are e.g. "1" not "1.0"
        voter_df["congressional"] = (
            voter_df["congressional"].map(ensure_int_string))
        voter_df["legislative"] = (
            voter_df["legislative"].map(ensure_int_string))
        voter_df["district"] = (voter_df["district"].map(ensure_int_string))

        # multiple active / inactive statuses are incompatible with our data
        # model; simplify them while also keeping the original data
        voter_df["unabridged_status"] = voter_df[self.config["voter_status"]]
        voter_df[self.config["voter_status"]] = voter_df[
            self.config["voter_status"]].map(simplify_status)

        # handle history: election ids are "<date>_<name>"
        hist_df["election_name"] = (hist_df["election_date"] + "_" +
                                    hist_df["election_name"])

        hist_df.dropna(subset=["election_name"], inplace=True)
        sorted_codes = sorted(hist_df["election_name"].unique().tolist())
        counts = hist_df["election_name"].value_counts()
        sorted_codes_dict = {
            k: {
                "index": int(i),
                "count": int(counts[k]),
                "date": handle_date(k),
            }
            for i, k in enumerate(sorted_codes)
        }

        hist_df.sort_values("election_name", inplace=True)
        hist_df.rename(columns={"voter_id": self.config["voter_id"]},
                       inplace=True)

        voter_df.set_index(self.config["voter_id"], drop=False, inplace=True)
        voter_groups = hist_df.groupby(self.config["voter_id"])

        # get extra data from history file that is missing from voter file
        voter_df["gender"] = voter_groups["voter_sex"].apply(
            lambda x: list(x)[-1])

        # at some point in in late 2020-early 2021 NJ started adding a reg_date
        # column and deprecating the registration_date information in the
        # voter_history file
        if "reg_date" in voter_df.columns:
            voter_df.rename(columns={"reg_date": "registration_date"},
                            inplace=True)
            # remove the UTC, does not fail if not utc
            voter_df["registration_date"] = pd.to_datetime(
                voter_df.registration_date,
                errors="coerce").dt.tz_localize(None)
        else:
            voter_df["registration_date"] = voter_groups[
                "voter_registrationDate"].apply(lambda x: list(x)[-1])

        self.column_check(list(voter_df.columns))

        voter_df = self.config.coerce_dates(voter_df)

        voter_df["all_history"] = voter_groups["election_name"].apply(list)
        voter_df["sparse_history"] = voter_df["all_history"].map(
            insert_code_bin)
        voter_df["party_history"] = voter_groups["voter_party"].apply(list)
        voter_df["votetype_history"] = voter_groups["ballot_type"].apply(list)
        del hist_df, voter_groups
        gc.collect()

        expected_cols = (self.config["ordered_columns"] +
                         self.config["ordered_generated_columns"])
        voter_df = self.reconcile_columns(voter_df, expected_cols)
        voter_df = voter_df[expected_cols]

        self.meta = {
            "message": "new_jersey2_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = voter_df.to_csv(encoding="utf-8", index=False)
        del voter_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Exemplo n.º 30
0
    def execute(self):
        """Process the Oklahoma per-county voter and history files.

        Merges the precincts file onto the combined per-county voter
        files, derives the county from the precinct code, builds
        per-voter history lists from the combined history files, and
        stores the result as ``self.processed_file``.
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file)
        # BUGFIX: the original did `[...][0]` and then checked `is None`;
        # an empty match raised IndexError before the check could ever run.
        precincts_file = next(
            (x for x in new_files if 'precincts' in x["name"].lower()), None)
        if precincts_file is None:
            raise ValueError("Missing Precincts File")
        voter_files = list(
            filter(lambda v: re.search('cty[0-9]+_vr.csv', v["name"].lower()),
                   new_files))
        self.file_check(len(voter_files) + 1)
        hist_files = list(
            filter(lambda v: re.search('cty[0-9]+_vh.csv', v["name"].lower()),
                   new_files))
        vdf = pd.DataFrame()
        hdf = pd.DataFrame()
        dtypes = self.config['dtypes']
        cty_map = dict([(value, key)
                        for key, value in self.config['county_codes'].items()])

        # Returns the string county name for the county code contained in the first two characters of the precicnct string
        def county_map(pct):
            def mapping(prec):
                county = cty_map[prec[:2]]
                return county

            return pd.Series(map(mapping, pct.tolist()))

        for file in voter_files:
            if "vr.csv" in file["name"].lower():
                temp_vdf = pd.read_csv(file["obj"],
                                       encoding='latin',
                                       dtype=dtypes)
                vdf = pd.concat([vdf, temp_vdf], ignore_index=True)
        vdf.drop_duplicates(inplace=True)

        # Read and merge the precincts file to the main df
        precinct_dtypes = {
            'PrecinctCode': 'string',
            'CongressionalDistrict': 'int64',
            'StateSenateDistrict': 'int64',
            'StateHouseDistrict': 'int64',
            'CountyCommissioner': 'int64',
            'PollSite': 'string'
        }
        precincts = pd.read_csv(precincts_file["obj"],
                                encoding='latin',
                                dtype=precinct_dtypes)
        precincts.rename(columns={"PrecinctCode": "Precinct"}, inplace=True)
        if precincts.empty:
            raise ValueError("Missing Precicnts file")
        vdf = vdf.merge(precincts, how='left', on='Precinct')

        # Add the county column
        vdf['County'] = county_map(vdf['Precinct'])

        # At one point OK added some columns, this adds them to older files for backwards compatibility
        self.reconcile_columns(vdf, self.config["columns"])
        for file in hist_files:
            temp_hdf = pd.read_csv(file["obj"], dtype={'VoterID': 'string'})
            hdf = pd.concat(
                [hdf, temp_hdf],
                ignore_index=True,
            )

        # Order elections by descending participation count.
        valid_elections, counts = np.unique(hdf["ElectionDate"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        hdf["array_position"] = hdf["ElectionDate"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        # The hist columns in the vdf are unecessary because we get a separate hist file that is more complete.
        hist_columns = [
            col for col in vdf.columns
            if "voterhist" in col.lower() or "histmethod" in col.lower()
        ]
        vdf = self.config.coerce_numeric(vdf)
        vdf = self.config.coerce_strings(vdf)
        vdf = self.config.coerce_dates(vdf)
        # BUGFIX: drop the redundant history *columns*. The original
        # `vdf.drop(hist_columns, inplace=True)` defaulted to axis=0 and
        # tried to drop rows by those labels, raising a KeyError.
        vdf.drop(columns=hist_columns, inplace=True)
        vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
        voter_groups = hdf.groupby(self.config["voter_id"])
        vdf["all_history"] = voter_groups["ElectionDate"].apply(list)
        vdf["sparse_history"] = voter_groups["array_position"].apply(list)
        vdf["votetype_history"] = voter_groups["VotingMethod"].apply(list)

        self.meta = {
            "message": "oklahoma_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(vdf.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )