def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        config = Config(file_name=self.config_file)
        new_files = self.unpack_files(file_obj=self.main_file)
        del self.main_file, self.temp_files
        gc.collect()

        voter_files = [f for f in new_files if "FVE" in f["name"]]
        election_maps = [f for f in new_files if "Election Map" in f["name"]]
        zone_codes = [f for f in new_files if "Codes" in f["name"]]
        zone_types = [f for f in new_files if "Types" in f["name"]]
        del new_files
        gc.collect()

        if not self.ignore_checks:
            # election maps need to line up to voter files?
            self.file_check(len(voter_files), len(election_maps))
        counties = config["county_names"]
        main_df = None
        # Preserving the order of the file sent, but concatinating the district and election columns which were
        # dropped in the legacy processed file
        dfcols = (config["ordered_columns"][:-3] + config["district_columns"] +
                  config["election_columns"] + config["ordered_columns"][-3:])

        # create a mapping that returns a series based on the values across rows (voters) of cells (election info).
        # consolidates the non nan values into one string that can be appended as a column later for the all_history and
        # the districts columns
        def list_map(df_sub, columns, zone_dict=None):
            def mapping(li, zone_dict=zone_dict):
                if zone_dict is None:
                    li = [x for x in li if x != "nan"]
                    return li
                else:
                    li = [
                        zone_dict[x] for x in li
                        if x != "nan" and x in zone_dict
                    ]
                    return li

            return pd.Series(
                map(mapping, df_sub[columns].values.astype(str).tolist()))

        sorted_codes = []
        sorted_code_dict = defaultdict(defaultdict)
        dtypes = {col: "str" for col in dfcols}
        for idx, c in enumerate(counties):
            logging.info("Processing {} {}/{}".format(c, idx, len(counties)))
            c = format_column_name(c)
            try:
                voter_file = next(f for f in voter_files
                                  if c in f["name"].lower())
                election_map = next(f for f in election_maps
                                    if c in f["name"].lower())
                zones = next(f for f in zone_codes if c in f["name"].lower())
                types = next(f for f in zone_types if c in f["name"].lower())
            except StopIteration:
                continue
            df = self.read_csv_count_error_lines(
                voter_file["obj"],
                sep="\t",
                names=dfcols,
                error_bad_lines=False,
                dtype=dtypes,
            )
            edf = self.read_csv_count_error_lines(
                election_map["obj"],
                sep="\t",
                names=["county", "number", "title", "date"],
                error_bad_lines=False,
                dtype={
                    "county": str,
                    "number": str,
                    "title": str,
                    "date": str,
                },
            )
            zdf = self.read_csv_count_error_lines(
                zones["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_code",
                    "zone_description",
                ],
                error_bad_lines=False,
            )
            tdf = self.read_csv_count_error_lines(
                types["obj"],
                sep="\t",
                names=[
                    "county_name",
                    "zone_number",
                    "zone_short_name",
                    "zone_long_name",
                ],
                error_bad_lines=False,
            )

            # Refactor note: format the election data into the format expected in the original all_history column
            edf["election_list"] = edf["title"] + " " + edf["date"]

            # Gather the positional vote and distict columns
            district_columns = df.columns[30:70].to_list()
            vote_columns = df.columns[70:150].to_list()

            # create a dict of the formatted election data using the index number in the given file, this
            # corresponds to the column index beginning at the start of the vote columns in the dataframe
            # Index begins starting at 1
            election_map = pd.Series(edf.election_list.values,
                                     index=edf.number).to_dict()

            # merge the zone files together to consolidate the information in one dataframe
            zdf = zdf.merge(tdf, how="left", on="zone_number")
            # format a column field that contains the zone description and the name so
            # that it matches the current district field
            zdf["combined"] = (zdf["zone_description"] + " Type: " +
                               zdf["zone_long_name"])

            # create a dict that utilizes the zone code as the key and the long name string as the value
            zone_dict = dict(zip(zdf.zone_code.astype(str), zdf.combined))

            # Gather the pairs of election columns to iterate over both at the same time to collect the information
            # contained in both of the columns per election
            vote_column_list = list(
                zip(df.columns[70:150:2], df.columns[71:150:2]))

            # get the value from the eleciton map key for the election name,
            # then combine it with the value in the party and vote type cells for the full election information
            # Creates a history dataframe containing, as cells, the election name as gathered in the election file, the
            # vote type (AP, A etc), and the party information, all separated by spaces
            # The columns are all named election_#_vote_type but the cells contain the relevant information
            vote_hist_df = pd.DataFrame({
                i: election_map[i.split("_")[1]] + " " + df[i] + " " + df[j]
                for i, j in vote_column_list if i.split("_")[1] in election_map
            })

            # counts for the metadata
            counts = vote_hist_df.count()
            for i in counts.index:
                current_key = election_map[i.split("_")[1]]

                # Metadata needs to be _ separated not space
                current_key = "_".join(current_key.split())
                if current_key in sorted_code_dict:
                    sorted_code_dict[current_key]["count"] += int(counts[i])
                else:
                    current_date = edf.loc[edf["number"] == i.split("_")
                                           [1]]["date"].values[0]
                    new_dict_entry = defaultdict(str)
                    new_dict_entry["date"] = current_date
                    new_dict_entry["count"] = int(counts[i])
                    sorted_code_dict[current_key] = new_dict_entry
            # converts the dataframe to a series that contains the list of elections participate in indexed on position
            vote_hist_df = list_map(vote_hist_df, vote_hist_df.columns)
            districts = list_map(df[district_columns], district_columns,
                                 zone_dict)

            df["all_history"] = vote_hist_df
            df["districts"] = districts
            df.drop(vote_columns, axis=1, inplace=True)
            df.drop(district_columns, axis=1, inplace=True)

            cols_to_check = [
                col for col in list(df.columns)
                if col not in vote_columns and col not in district_columns
            ]

            self.column_check(list(df.columns), cols_to_check)
            if main_df is None:
                main_df = df
            else:
                main_df = pd.concat([main_df, df], ignore_index=True)

        del voter_files, election_maps, zone_codes, zone_types
        gc.collect()

        sorted_keys = sorted(sorted_code_dict.items(),
                             key=lambda x: parser.parse(x[1]["date"]))
        for index, key in enumerate(sorted_keys):
            sorted_code_dict[key[0]]["index"] = index
            sorted_codes.append(key[0])
        del sorted_keys
        gc.collect()

        logging.info("coercing")
        main_df = config.coerce_dates(main_df)
        main_df = config.coerce_numeric(
            main_df,
            extra_cols=[
                "house_number",
                "apartment_number",
                "address_line_2",
                "zip",
                "mail_address_1",
                "mail_address_2",
                "mail_zip",
                "precinct_code",
                "precinct_split_id",
                "legacy_id",
                "home_phone",
            ],
        )
        logging.info("Writing CSV")
        self.meta = {
            "message": "pennsylvania_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_code_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = main_df.to_csv(encoding="utf-8", index=False)
        del main_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
Пример #2
0
class PreprocessNorthCarolina(Preprocessor):
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):

        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
        self.config = Config(file_name=config_file)

    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file)  # array of dicts
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))

        for i in new_files:
            if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
                vote_hist_file = i
            elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
                voter_file = i
        voter_df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            quotechar='"',
            encoding="latin-1",
            error_bad_lines=False,
        )
        del voter_file
        gc.collect()

        vote_hist = self.read_csv_count_error_lines(
            vote_hist_file["obj"],
            sep="\t",
            quotechar='"',
            error_bad_lines=False,
        )
        del vote_hist_file, new_files
        gc.collect()

        try:
            voter_df.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the voter file in North Carolina"
            )
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(voter_df.columns),
            )
        try:
            vote_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the history file in North Carolina"
            )
            raise

        valid_elections, counts = np.unique(vote_hist["election_desc"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]

        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        vote_hist["array_position"] = vote_hist["election_desc"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, count_order
        gc.collect()

        voter_groups = vote_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["voting_method"].apply(list)

        voter_df = voter_df.set_index(self.config["voter_id"])

        voter_df["all_history"] = all_history
        voter_df["vote_type"] = vote_type
        del voter_groups, vote_hist, all_history, vote_type
        gc.collect()

        voter_df = self.config.coerce_strings(voter_df)
        voter_df = self.config.coerce_dates(voter_df)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "county_commiss_abbrv",
                "fire_dist_abbrv",
                "full_phone_number",
                "judic_dist_abbrv",
                "munic_dist_abbrv",
                "municipality_abbrv",
                "precinct_abbrv",
                "precinct_desc",
                "school_dist_abbrv",
                "super_court_abbrv",
                "township_abbrv",
                "township_desc",
                "vtd_abbrv",
                "vtd_desc",
                "ward_abbrv",
            ],
        )

        self.meta = {
            "message": "north_carolina_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.is_compressed = False

        csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
        del voter_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()