Example #1
def state_download(state, s3_bucket):
    config_file = Config.config_file_from_state(state=state)
    configs = Config(file_name=config_file)

    if state == "north_carolina":
        today = nc_date_grab()
        list_files = configs['data_chunk_links']
        zipped_files = []
        for i, url in enumerate(list_files):
            target_path = "/tmp/" + state + str(i) + ".zip"
            zipped_files.append(target_path)
            response = requests.get(url, stream=True)
            with open(target_path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        handle.write(chunk)
        file_to_zip = today + ".zip"
        with zipfile.ZipFile(file_to_zip, 'w') as myzip:
            for f in zipped_files:
                myzip.write(f)
        file_to_zip = FileItem(
            "NC file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket)
        loader = Loader(config_file=config_file, force_date=today,
                        s3_bucket=s3_bucket)
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)

    elif state == "ohio":
        today = ohio_get_last_updated().isoformat()[0:10]
        list_files = configs['data_chunk_links']
        file_names = configs['data_file_names']
        zipped_files = []
        for i, url in enumerate(list_files):
            logging.info("downloading {} file".format(url))
            target_path = "/tmp/" + state + "_" + file_names[i] + ".txt.gz"
            zipped_files.append(target_path)
            response = requests.get(url, stream=True, verify=False)
            with open(target_path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        handle.write(chunk)
            logging.info("downloaded {} file".format(url))
        file_to_zip = today + ".zip"
        logging.info("Zipping files")
        with zipfile.ZipFile(file_to_zip, 'w') as myzip:
            for f in zipped_files:
                myzip.write(f)
        logging.info("Uploading")
        file_to_zip = FileItem(
            "OH file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket)
        loader = Loader(config_file=config_file, force_date=today,
                        s3_bucket=s3_bucket)
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)
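A minimal invocation sketch for the function above, assuming AWS credentials are already configured; the bucket name is illustrative:

state_download("ohio", s3_bucket="voteshield-uploads")  # hypothetical bucket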
Example #2
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):

        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
        self.config = Config(file_name=config_file)
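For context, the fallback in the first two lines means the snapshot date can be recovered from the S3 key itself. A hedged sketch of that behavior, with an illustrative key and assumed date_from_str output:

key = "voter_file/north_carolina/raw/2021-05-04/file.zip"  # illustrative key
assert date_from_str(key) == "2021-05-04"  # assumed return format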
Example #3
def convert_voter_file(state=None,
                       local_file=None,
                       file_date=None,
                       write_file=False):
    config_file = Config.config_file_from_state(state)
    file_date = str(datetime.datetime.strptime(file_date, '%Y-%m-%d').date())
    with Preprocessor(None,
                      config_file,
                      force_file=local_file,
                      force_date=file_date) as preprocessor:
        file_item = preprocessor.execute()
        if not write_file:
            return (preprocessor.output_dataframe(file_item),
                    preprocessor.meta)
        preprocessor.local_dump(file_item)
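A hedged usage sketch for this variant, which returns the processed output and metadata when write_file is falsy; the state and path are illustrative:

df_out, meta = convert_voter_file(
    state="north_carolina",        # illustrative state
    local_file="nc_snapshot.zip",  # illustrative local path
    file_date="2021-05-04",
    write_file=False,
)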
Example #4
    def __init__(self,
                 raw_s3_file,
                 config_file,
                 force_date=None,
                 force_file=None,
                 testing=False,
                 ignore_checks=False,
                 s3_bucket="",
                 **kwargs):

        # Init change begin (adding loader object)
        self.config_file_path = config_file
        self.config = Config(file_name=config_file)
        self.chunk_urls = (self.config[CONFIG_CHUNK_URLS]
                           if CONFIG_CHUNK_URLS in self.config else [])
        if "tmp" not in os.listdir("/"):
            os.system("mkdir /tmp")
        self.file_type = self.config["file_type"]
        self.source = self.config["source"]
        self.is_compressed = False
        self.checksum = None
        self.state = self.config["state"]
        self.meta = None
        self.testing = testing
        self.ignore_checks = ignore_checks
        self.s3_bucket = s3_bucket
        if force_date is not None:
            self.download_date = parser.parse(force_date).isoformat()
        else:
            self.download_date = datetime.now().isoformat()
        if force_file is not None:
            working_file = "/tmp/voteshield_{}.tmp".format(uuid.uuid4())
            logging.info("copying {} to {}".format(force_file, working_file))
            shutil.copy2(force_file, working_file)
            self.main_file = FileItem("loader_force_file",
                                      filename=working_file,
                                      s3_bucket=self.s3_bucket)
        else:
            self.main_file = "/tmp/voteshield_{}.tmp".format(uuid.uuid4())

        self.temp_files = [self.main_file]

        # Init change end
        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        self.raw_s3_file = raw_s3_file
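A hedged construction sketch for the force_file branch above: the local file is copied to a /tmp working file and wrapped in a FileItem so later steps can treat it like an S3 download (paths and state are illustrative):

pre = Preprocessor(                        # or a state-specific subclass
    raw_s3_file=None,
    config_file=Config.config_file_from_state("ohio"),
    force_file="/data/ohio_snapshot.zip",  # illustrative path
    force_date="2021-05-04",
)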
Example #5
def convert_voter_file(state=None,
                       local_file=None,
                       file_date=None,
                       write_file=False):
    """Main Reggie function; processes a voter file, which is often more than one file, so will likely be a compressed file such as a .zip file.

    Parameters
    ----------
    state : string, optional
        State identifier: the lower-case state name with underscores in place of spaces, by default None
    local_file : string, optional
        Path to file to process, by default None
    file_date : string, optional
        The snapshot date in format "YYYY-MM-DD", by default None
    write_file : bool, optional
        Whether to write the output to a CSV file, which is automatically named and written to the local directory, by default False

    Returns
    -------
    tuple
        If `write_file` is falsy, this function returns a tuple with the following objects:
            - The processed voter file as a CSV string
            - The meta data object
            - The preprocessor object
    """
    config_file = Config.config_file_from_state(state)
    file_date = str(datetime.datetime.strptime(file_date, "%Y-%m-%d").date())
    preprocessor = state_router(
        state,
        raw_s3_file=None,
        config_file=config_file,
        force_file=local_file,
        force_date=file_date,
    )
    preprocessor.execute()
    if not write_file:
        return (
            preprocessor.output_dataframe(preprocessor.processed_file),
            preprocessor.meta,
            preprocessor,
        )
    preprocessor.local_dump(preprocessor.processed_file)
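A hedged sketch of the three-object return in this version (contrast with Example #3, which returns only the output and metadata); values are illustrative:

csv_str, meta, pre = convert_voter_file(
    state="ohio",                  # illustrative state
    local_file="oh_snapshot.zip",  # illustrative path
    file_date="2021-05-04",
)
print(meta["message"])             # e.g. "ohio_<iso timestamp>", per the meta dicts below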
Example #6
def get_processed_s3_uploads(state, s3_bucket, testing=False):
    configs = Config(state=state)
    keys = get_s3_uploads(configs["state"], configs["file_class"],
                          configs["source"], s3_bucket, testing)
    return keys
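A hedged call sketch; the bucket name is illustrative:

for key in get_processed_s3_uploads("ohio", s3_bucket="voteshield-uploads"):
    print(key)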
Example #7
    def execute(self):
        def district_fun(df_dist, df_voter, dist_dict):
            for dist_code in dist_dict.keys():
                temp_df = df_dist[df_dist["DistrictTypeCode"] == dist_code]
                temp_df = temp_df.rename(
                    columns={"DistrictName": dist_dict[dist_code]})
                df_voter = pd.merge(
                    df_voter,
                    temp_df[["PrecinctId", dist_dict[dist_code]]],
                    how="left",
                    on="PrecinctId",
                )
            df_voter.drop(columns=["PrecinctId"], inplace=True)
            return df_voter

        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        config = Config(file_name=self.config_file)
        new_files = self.unpack_files(file_obj=self.main_file)

        del self.main_file, self.temp_files
        gc.collect()

        # Match on the longer substring, not just the suffix, because the
        # history pattern would otherwise also match the voter file
        voter_file = [f for f in new_files if "pvrdr-vrd" in f["name"]][0]
        district_file = [f for f in new_files if "pvrdr-pd" in f["name"]][0]
        history_file = [f for f in new_files if "pvrdr-vph" in f["name"]][0]

        temp_voter_id_df = pd.read_csv(
            voter_file["obj"],
            sep="\t",
            encoding="latin-1",
            usecols=["RegistrantID"],
            dtype=str,
        )
        # rewind
        voter_file["obj"].seek(0)

        voter_ids = temp_voter_id_df["RegistrantID"].unique().tolist()

        del temp_voter_id_df
        gc.collect()

        hist_dict = {i: np.nan for i in voter_ids}
        votetype_dict = {i: np.nan for i in voter_ids}
        del voter_ids
        gc.collect()

        # key: election; value: date and count, sorted later. all_hist is then
        # iterated over and mapped to the sparse encoding
        elect_dict = defaultdict(int)

        def dict_cols(chunk,
                      history_dict=None,
                      votetype_dict=None,
                      election_dict=None):
            chunk["combined_col"] = (
                chunk["ElectionType"].replace(" ", "") + "_" +
                chunk["ElectionDate"]
                # + "_"
                # + chunk["Method"]
            )
            chunk["election"] = (chunk["ElectionType"].replace(" ", "") + "_" +
                                 chunk["ElectionDate"])
            chunk.drop(
                columns=[
                    "ElectionType",
                    "ElectionName",
                    "ElectionDate",
                    "CountyCode",
                ],
                inplace=True,
            )
            for row in chunk.itertuples():
                try:
                    current_li = hist_dict[row.RegistrantID]
                    votetype_hist = votetype_dict[row.RegistrantID]
                    # throws key error for entries not in voter file
                    election_dict[row.election] += 1
                    combined_row = row.combined_col
                    if isinstance(current_li, list):
                        current_li.append(combined_row)
                        votetype_hist.append(row.Method)
                        history_dict[row.RegistrantID] = current_li
                        votetype_dict[row.RegistrantID] = votetype_hist
                    else:
                        # test_dict[row['RegistrantID']][0]
                        history_dict[row.RegistrantID] = [
                            combined_row
                        ]  # Create list of elections even if len 1
                        votetype_dict[row.RegistrantID] = [row.Method]
                except KeyError:
                    continue

        # Chunk size; over ~3 million or so leads to slowdown
        chunk_size = 3000000

        history_chunks = pd.read_csv(
            history_file["obj"],
            sep="\t",
            usecols=[
                "RegistrantID",
                "CountyCode",
                "ElectionDate",
                "ElectionName",
                "ElectionType",
                "Method",
            ],
            dtype=str,
            chunksize=chunk_size,
        )
        for chunk in history_chunks:
            dict_cols(chunk, hist_dict, votetype_dict, elect_dict)

        history_file["obj"].close()
        del history_file
        gc.collect()

        hist_series = pd.Series(hist_dict, name="all_history")

        del hist_dict
        gc.collect()

        votetype_series = pd.Series(votetype_dict, name="votetype_history")

        del votetype_dict
        gc.collect()

        logging.info("reading in CA voter df")

        category_list = [
            "CountyCode",
            "Suffix",
            "StreetDirPrefix",
            "AddressNumberSuffix",
            "StreetType",
            "StreetDirSuffix",
            "UnitType",
            "City",
            "State",
            "Zip",
            "Language",
            "Gender",
            "PartyCode",
            "Status",
            "VoterStatusReasonCodeDesc",
            "AssistanceRequestFlag",
            "VbmVoterType",
            "USCongressionalDistrict",
            "StateSenate",
            "Municipality",
            "StateAddr",
        ]
        # read in columns to set dtype as pyarrow
        col_ifornia = pd.read_csv(voter_file["obj"],
                                  sep="\t",
                                  nrows=0,
                                  encoding="latin-1").columns.tolist()

        voter_file["obj"].seek(0)
        dtype_dict = {
            col:
            ("string[pyarrow]" if col not in category_list else "category")
            for col in col_ifornia
        }
        voter_df = pd.read_csv(
            voter_file["obj"],
            sep="\t",
            dtype=dtype_dict,
            encoding="latin-1",
            on_bad_lines="warn",
        )

        # Replaces the state column name in the address fields with StateAddr to avoid duplicate column names
        voter_df.rename(columns={"State": "StateAddr"}, inplace=True)

        logging.info("dataframe memory usage: {}".format(
            round((voter_df.memory_usage(deep=True).sum() / 1024**2), 2)))

        voter_file["obj"].close()
        del voter_file
        gc.collect()

        district_dict = {
            "CG": "USCongressionalDistrict",
            "SS": "StateSenate",
            "SA": "StateAssembly",
            "CI": "Municipality",
            "SU": "CountySupervisoral",
        }
        district_df = pd.read_csv(district_file["obj"],
                                  sep="\t",
                                  dtype="string[pyarrow]")

        district_file["obj"].close()
        del district_file

        merged_districts = district_fun(
            district_df,
            voter_df[["RegistrantID", "PrecinctId"]],
            district_dict,
        )

        voter_df = voter_df.merge(merged_districts,
                                  left_on="RegistrantID",
                                  right_on="RegistrantID")

        del merged_districts
        gc.collect()

        voter_df.set_index("RegistrantID", inplace=True)

        voter_df = voter_df.merge(hist_series,
                                  left_index=True,
                                  right_index=True)

        del hist_series
        gc.collect()

        voter_df = voter_df.merge(votetype_series,
                                  left_index=True,
                                  right_index=True)

        del votetype_series
        gc.collect()

        # create sparse history
        sorted_keys = sorted(elect_dict.items(),
                             key=lambda x: x[0].split("_")[1])
        sorted_codes_dict = {
            value[0]: {
                "index": i,
                "count": value[1]
            }
            for i, value in enumerate(sorted_keys)
        }
        sorted_codes = [x[0] for x in sorted_keys]
        voter_df["sparse_history"] = voter_df.all_history.apply(
            lambda x: [sorted_codes_dict[y]["index"] for y in x]
            if x == x else np.nan)
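        # e.g. (illustrative) with sorted_codes == ["primary_2018-06-05",
        # "general_2018-11-06"], an all_history of ["general_2018-11-06"]
        # encodes to [1]; x == x is False only for NaN, so voters with no
        # history keep NaN instead of raising in the list comprehension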
        # Begin Coerce

        # convert categories back into strings
        logging.info("coercing strings")
        voter_df = self.coerce_strings(voter_df, config, category_list)

        logging.info("coecrcing dates")
        voter_df = self.config.coerce_dates(voter_df)

        logging.info("coecrcing numeric")
        voter_df = self.config.coerce_numeric(voter_df)

        voter_df = voter_df.reset_index().rename(
            columns={"index": "RegistrantID"})

        voter_csv = voter_df.to_csv(encoding="utf-8", index=False)

        del voter_df
        gc.collect()

        self.meta = {
            "message": "california_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_csv),
            s3_bucket=self.s3_bucket,
        )
Example #8
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        config = Config(file_name=self.config_file)
        new_files = self.unpack_files(file_obj=self.main_file)
        del self.main_file, self.temp_files
        gc.collect()

        voter_files = [f for f in new_files if "FVE" in f["name"]]
        election_maps = [f for f in new_files if "Election Map" in f["name"]]
        zone_codes = [f for f in new_files if "Codes" in f["name"]]
        zone_types = [f for f in new_files if "Types" in f["name"]]
        del new_files
        gc.collect()

        if not self.ignore_checks:
            # the number of election maps should line up with the number of
            # voter files
            self.file_check(len(voter_files), len(election_maps))
        counties = config["county_names"]
        main_df = None
        # Preserve the order of the file as sent, but concatenate the district
        # and election columns, which were dropped in the legacy processed
        # file
        dfcols = (config["ordered_columns"][:-3] + config["district_columns"] +
                  config["election_columns"] + config["ordered_columns"][-3:])

        # create a mapping that returns a Series based on the values across
        # rows (voters) of cells (election info); consolidates the non-NaN
        # values into one list that can later be appended as the all_history
        # and districts columns
        def list_map(df_sub, columns, zone_dict=None):
            def mapping(li, zone_dict=zone_dict):
                if zone_dict is None:
                    li = [x for x in li if x != "nan"]
                    return li
                else:
                    li = [
                        zone_dict[x] for x in li
                        if x != "nan" and x in zone_dict
                    ]
                    return li

            return pd.Series(
                map(mapping, df_sub[columns].values.astype(str).tolist()))

        sorted_codes = []
        sorted_code_dict = defaultdict(defaultdict)
        dtypes = {col: "str" for col in dfcols}
        for idx, c in enumerate(counties):
            logging.info("Processing {} {}/{}".format(c, idx, len(counties)))
            c = format_column_name(c)
            try:
                voter_file = next(f for f in voter_files
                                  if c in f["name"].lower())
                election_map = next(f for f in election_maps
                                    if c in f["name"].lower())
                zones = next(f for f in zone_codes if c in f["name"].lower())
                types = next(f for f in zone_types if c in f["name"].lower())
            except StopIteration:
                continue
            df = self.read_csv_count_error_lines(
                voter_file["obj"],
                sep="\t",
                names=dfcols,
                error_bad_lines=False,
                dtype=dtypes,
            )
            edf = self.read_csv_count_error_lines(
                election_map["obj"],
                sep="\t",
                names=["county", "number", "title", "date"],
                error_bad_lines=False,
                dtype={
                    "county": str,
                    "number": str,
                    "title": str,
                    "date": str,
                },
            )
            zdf = self.read_csv_count_error_lines(
                zones["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_code",
                    "zone_description",
                ],
                error_bad_lines=False,
            )
            tdf = self.read_csv_count_error_lines(
                types["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_short_name",
                    "zone_long_name",
                ],
                error_bad_lines=False,
            )

            # Refactor note: format the election data into the format expected in the original all_history column
            edf["election_list"] = edf["title"] + " " + edf["date"]

            # Gather the positional vote and district columns
            district_columns = df.columns[30:70].to_list()
            vote_columns = df.columns[70:150].to_list()

            # create a dict of the formatted election data keyed by the index
            # number in the given file; this corresponds to the column index
            # at the start of the vote columns in the dataframe (indexing
            # begins at 1)
            election_map = pd.Series(edf.election_list.values,
                                     index=edf.number).to_dict()

            # merge the zone files together to consolidate the information in one dataframe
            zdf = zdf.merge(tdf, how="left", on="zone_number")
            # format a column field that contains the zone description and the name so
            # that it matches the current district field
            zdf["combined"] = (zdf["zone_description"] + " Type: " +
                               zdf["zone_long_name"])

            # create a dict that utilizes the zone code as the key and the long name string as the value
            zone_dict = dict(zip(zdf.zone_code.astype(str), zdf.combined))

            # Gather the pairs of election columns to iterate over both at the same time to collect the information
            # contained in both of the columns per election
            vote_column_list = list(
                zip(df.columns[70:150:2], df.columns[71:150:2]))

            # get the election name from the election map, then combine it
            # with the party and vote-type cells for the full election
            # information. Creates a history dataframe whose cells contain the
            # election name as gathered in the election file, the vote type
            # (AP, A, etc.), and the party, all separated by spaces. The
            # columns are all named election_#_vote_type, but the cells carry
            # the relevant information
            vote_hist_df = pd.DataFrame({
                i: election_map[i.split("_")[1]] + " " + df[i] + " " + df[j]
                for i, j in vote_column_list if i.split("_")[1] in election_map
            })

            # counts for the metadata
            counts = vote_hist_df.count()
            for i in counts.index:
                current_key = election_map[i.split("_")[1]]

                # Metadata needs to be underscore-separated, not
                # space-separated
                current_key = "_".join(current_key.split())
                if current_key in sorted_code_dict:
                    sorted_code_dict[current_key]["count"] += int(counts[i])
                else:
                    current_date = edf.loc[
                        edf["number"] == i.split("_")[1], "date"].values[0]
                    new_dict_entry = defaultdict(str)
                    new_dict_entry["date"] = current_date
                    new_dict_entry["count"] = int(counts[i])
                    sorted_code_dict[current_key] = new_dict_entry
            # convert the dataframe to a Series containing the list of
            # elections participated in, indexed on position
            vote_hist_df = list_map(vote_hist_df, vote_hist_df.columns)
            districts = list_map(df[district_columns], district_columns,
                                 zone_dict)

            df["all_history"] = vote_hist_df
            df["districts"] = districts
            df.drop(vote_columns, axis=1, inplace=True)
            df.drop(district_columns, axis=1, inplace=True)

            cols_to_check = [
                col for col in list(df.columns)
                if col not in vote_columns and col not in district_columns
            ]

            self.column_check(list(df.columns), cols_to_check)
            if main_df is None:
                main_df = df
            else:
                main_df = pd.concat([main_df, df], ignore_index=True)

        del voter_files, election_maps, zone_codes, zone_types
        gc.collect()

        sorted_keys = sorted(sorted_code_dict.items(),
                             key=lambda x: parser.parse(x[1]["date"]))
        for index, key in enumerate(sorted_keys):
            sorted_code_dict[key[0]]["index"] = index
            sorted_codes.append(key[0])
        del sorted_keys
        gc.collect()

        logging.info("coercing")
        main_df = config.coerce_dates(main_df)
        main_df = config.coerce_numeric(
            main_df,
            extra_cols=[
                "house_number",
                "apartment_number",
                "address_line_2",
                "zip",
                "mail_address_1",
                "mail_address_2",
                "mail_zip",
                "precinct_code",
                "precinct_split_id",
                "legacy_id",
                "home_phone",
            ],
        )
        logging.info("Writing CSV")
        self.meta = {
            "message": "pennsylvania_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_code_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = main_df.to_csv(encoding="utf-8", index=False)
        del main_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
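A hedged end-to-end sketch for either execute() above, assuming a concrete subclass and S3 layout; the class name, key, and bucket are illustrative:

pre = PreprocessPennsylvania(        # hypothetical subclass name
    raw_s3_file="voter_file/pennsylvania/raw/2021-05-04/file.zip",
    config_file=Config.config_file_from_state("pennsylvania"),
    s3_bucket="voteshield-uploads",  # illustrative bucket
)
pre.execute()                        # populates processed_file and meta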
Example #9
class PreprocessNorthCarolina(Preprocessor):
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):

        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
        self.config = Config(file_name=config_file)

    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file)  # array of dicts
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))

        for i in new_files:
            if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
                vote_hist_file = i
            elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
                voter_file = i
        voter_df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            quotechar='"',
            encoding="latin-1",
            error_bad_lines=False,
        )
        del voter_file
        gc.collect()

        vote_hist = self.read_csv_count_error_lines(
            vote_hist_file["obj"],
            sep="\t",
            quotechar='"',
            error_bad_lines=False,
        )
        del vote_hist_file, new_files
        gc.collect()

        try:
            voter_df.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the voter file in North Carolina"
            )
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(voter_df.columns),
            )
        try:
            vote_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the history file in North Carolina"
            )
            raise

        valid_elections, counts = np.unique(vote_hist["election_desc"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]

        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        vote_hist["array_position"] = vote_hist["election_desc"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, count_order
        gc.collect()

        voter_groups = vote_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["voting_method"].apply(list)

        voter_df = voter_df.set_index(self.config["voter_id"])

        voter_df["all_history"] = all_history
        voter_df["vote_type"] = vote_type
        del voter_groups, vote_hist, all_history, vote_type
        gc.collect()

        voter_df = self.config.coerce_strings(voter_df)
        voter_df = self.config.coerce_dates(voter_df)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "county_commiss_abbrv",
                "fire_dist_abbrv",
                "full_phone_number",
                "judic_dist_abbrv",
                "munic_dist_abbrv",
                "municipality_abbrv",
                "precinct_abbrv",
                "precinct_desc",
                "school_dist_abbrv",
                "super_court_abbrv",
                "township_abbrv",
                "township_desc",
                "vtd_abbrv",
                "vtd_desc",
                "ward_abbrv",
            ],
        )

        self.meta = {
            "message": "north_carolina_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.is_compressed = False

        csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
        del voter_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
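Finally, a hedged run-through of this class; the S3 key and bucket are illustrative:

nc = PreprocessNorthCarolina(
    raw_s3_file="voter_file/north_carolina/raw/2021-05-04/file.zip",
    config_file=Config.config_file_from_state("north_carolina"),
    s3_bucket="voteshield-uploads",  # illustrative bucket
)
nc.execute()
result = nc.processed_file           # FileItem holding the processed CSV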