Exemplo n.º 1
0
    def get_direct(self):
        """Fetch the NYT colleges COVID dataset and return the number of
        distinct colleges per county FIPS code.

        Rows whose county cannot be mapped to a FIPS code (rewritten to
        "ERROR") are dropped before aggregation.
        """
        raw = pd.read_csv(
            "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
        )

        norm = e.usa_county_to_fips("state")
        # County-name fixups for the normalizer; "ERROR" marks entries that
        # have no usable FIPS and are removed below.
        norm.rewrite.update({
            "baltimore": "baltimore city",
            "st. louis": "st. louis city",
            "new york city": "new york",
            "franklin": "franklin city",
            "richmond": "richmond city",
            "fairfax": "fairfax city",
            "roanoke": "roanoke city",
            "st. thomas": "st. thomas island",
            "doña ana": "dona ana",
            "bayam_n": "bayamon",
            "maoputasi": "ERROR",
            "mangilao village": "ERROR",
            "nan": "ERROR",
            "joplin": "jasper",
            "kansas city": "jackson",
            "washington, d.c.": "ERROR",
        })
        norm.apply_to_df(raw, "county", "county_fips", var_name="norm")

        cleaned = e.remove_errors(raw, "county_fips")

        agg = e.Aggregator(
            grouped_columns=["county_fips"],
            # Count distinct college names within each county.
            aggregation_functions={"college": lambda x: len(set(x))},
        )
        # Columns not needed in the aggregated output.
        agg.removed_columns.extend([
            "cases",
            "cases_2021",
            "city",
            "college",
            "county",
            "ipeds_id",
            "notes",
            "state",
            "date",
        ])

        return agg(cleaned, var_name="agg")
Exemplo n.º 2
0
    def get_direct(self):
        """Load the Census 2010-2019 county population estimates, normalize
        county names to FIPS codes, aggregate, and merge in Puerto Rico.
        """
        raw = e.to_csv(
            e.download(
                "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
            ))
        # Rows with COUNTY == 0 are state-level totals; keep county rows only.
        raw = raw[raw.COUNTY != 0].copy()

        # Population columns: the 2010 census baseline plus one estimate
        # per year from 2010 through 2019.
        counts = ["CENSUS2010POP", "ESTIMATESBASE2010"] + [
            f"POPESTIMATE{year}" for year in range(2010, 2020)
        ]
        raw = raw[["STNAME", "CTYNAME", *counts]]

        normalizer = e.usa_county_to_fips(
            "STNAME", alaska_handler=self.alaska_handler)
        normalizer.rewrite["doña ana county"] = "dona ana county"
        normalizer.apply_to_df(raw, "CTYNAME", "FIPS")
        # NOTE(review): "02AL" appears to be a synthetic FIPS produced by the
        # alaska_handler; all such rows get a single "Alaska" name — confirm.
        raw.loc[raw.FIPS == "02AL", "CTYNAME"] = "Alaska"

        aggregated = e.Aggregator(
            grouped_columns=["FIPS"],
            aggregation_functions={c: np.sum for c in counts},
        )(raw)

        return e.merge(
            by_source=dict(mainland=aggregated, pr=self.puerto_rico()),
            join_columns=["FIPS"],
            ignore_duplication={},
            resolvers=[],
        )
Exemplo n.º 3
0
    def get_direct(self):
        """Download the Harvard Dataverse 2018 general-election wide files
        (US House, US Senate, governor), normalize counties / offices /
        districts, aggregate by county and race, and apply pointwise
        corrections sourced from Ballotpedia and the NYT.
        """

        # https://doi.org/10.7910/DVN/UYSHST
        bs = requests.get(
            "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/UYSHST/0UQP48"
        ).content
        # The datafile is a zip archive containing one wide CSV per office.
        zf = ZipFile(io.BytesIO(bs))
        with zf.open("national-files/us-house-wide.csv") as f:
            df_house = e.to_csv(f.read().decode("utf-8"))
        with zf.open("national-files/us-senate-wide.csv") as f:
            df_senate = e.to_csv(f.read().decode("utf-8"))
        with zf.open("national-files/governor-wide.csv") as f:
            df_governor = e.to_csv(f.read().decode("utf-8"))

        # Just manually fix the Angus King thing
        # (fold Maine's "other" senate votes into the "dem" column and zero
        #  the remainder — NOTE(review): assumes ME's "other" column is
        #  entirely the King vote; confirm against the source file.)
        df_senate.loc[df_senate.state == "ME",
                      "dem"] += df_senate[df_senate.state == "ME"]["other"]
        df_senate.loc[df_senate.state == "ME", "other"] = 0

        df = pd.concat([df_house, df_senate, df_governor])

        # Split on fipscode2: multiples of 100000 are county-level rows,
        # everything else is township-level; each gets its own normalizer.
        df = pd.concat([
            self.normalize_named_counties(df[df.fipscode2 % 100000 == 0]),
            self.normalize_townships(df[df.fipscode2 % 100000 != 0]),
        ])

        # Flag special elections before office names are normalized.
        df["special"] = df.office == "US Senate Special Election"

        df = df.rename(columns={
            "dem": "votes_DEM",
            "rep": "votes_GOP",
            "other": "votes_other"
        })

        office_normalizer = e.usa_office_normalizer()

        office_normalizer.apply_to_df(df, "office", "office")

        e.district_normalizer().apply_to_df(df, "district", "district")

        # West Virginia districts come through as 31/21/11; map them to
        # their actual district numbers 3/2/1.
        df.district = df.apply(
            lambda row: row.district
            if row.district == "statewide" or row.state != "WV" else {
                31: 3,
                21: 2,
                11: 1
            }[row.district],
            axis=1,
        )

        # A house race cannot be statewide; drop such malformed rows.
        df = df[~((df.district == "statewide") & (df.office == "us house"))]

        agg = e.Aggregator(
            grouped_columns=["county_fips", "district", "office", "special"],
            aggregation_functions={
                "votes_DEM": sum,
                "votes_GOP": sum,
                "votes_other": sum,
            },
        )

        agg.removed_columns.append("county")

        df = agg(df)

        # Pointwise corrections below override aggregated totals for races
        # where the source data is known to be wrong.

        # Source: https://ballotpedia.org/Arizona%27s_5th_Congressional_District_election,_2018
        df.loc[(df.state == "AZ") & (df.district == 5), "votes_GOP"] = 186037

        # Source: https://www.nytimes.com/elections/results/connecticut-house-district-1
        df.loc[(df.state == "CT") & (df.district == 1) &
               (df.county_fips == "09003"),
               ["votes_DEM", "votes_GOP", "votes_other"], ] = [
                   159950, 81418, 2651
               ]
        df.loc[(df.state == "CT") & (df.district == 5) &
               (df.county_fips == "09003"),
               ["votes_DEM", "votes_GOP", "votes_other"], ] = [
                   41805, 26985, 0
               ]
        # Source: https://www.nytimes.com/elections/results/michigan-house-district-13
        df.loc[(df.state == "MI") & (df.district == 13),
               ["votes_DEM", "votes_GOP", "votes_other"], ] = [165355, 0, 0]

        # Extra row removal, this row should not be present
        df = df[~((df.county_fips == "24037") & (df.district == 1))]

        # Copy to de-fragment / detach from the filtered view before the
        # final in-place assignment.
        df = df.copy()

        # this row belongs to the wrong county
        # see https://www.nytimes.com/elections/results/maryland-house-district-5
        # Somerset is included instead of St Marys
        df.loc[(df.state == "MD") & (df.district == 5) &
               (df.county_fips == "24039"), "county_fips", ] = "24037"

        return df
    def get_direct(self):
        """Merge the Harvard Dataverse and MIT Election Lab 2018 general
        election results, cross-validate the district-level totals against
        the Harvard congressional-district summary, and patch uncontested
        races before returning the merged county-level frame.
        """
        df_harvard = HarvardDataverse2018General(self.alaska_handler).get()

        df_mit = MITElectionLab2018General().get()
        df_mit = df_mit.copy()

        # Use the two-letter postal code as the state identifier so the
        # column matches the Harvard frame.
        df_mit["state"] = df_mit["state_po"]

        df_mit = df_mit[[
            "state",
            "office",
            "district",
            "votes_DEM",
            "votes_GOP",
            "votes_other",
            "county_fips",
            "special",
        ]]

        # Keep only the three offices covered by the merge.
        df_mit = df_mit[df_mit.office.apply(
            lambda x: x in {"us house", "us senate", "us state governor"})]
        df_mit = df_mit[df_mit.district != "District 0"]

        e.district_normalizer().apply_to_df(df_mit, "district", "district")

        df = e.merge(
            by_source={
                "harvard": df_harvard,
                "mit": df_mit
            },
            join_columns=[
                "county_fips", "office", "district", "state", "special"
            ],
            # Where the two sources disagree only on third-party votes,
            # average them instead of flagging a conflict.
            ignore_duplication={"votes_other": np.mean},
            resolvers=self.resolvers(),
            checksum=e.Aggregator(
                grouped_columns=["district", "office", "state", "special"],
                aggregation_functions={
                    "votes_DEM": sum,
                    "votes_GOP": sum,
                    "votes_other": sum,
                },
                removed_columns=["county_fips"],
            ),
        )
        # Roll the merged county-level rows up to district level for the
        # validation passes below.
        by_district = e.Aggregator(
            grouped_columns=["district", "office", "state", "special"],
            removed_columns=["county_fips"],
            aggregation_functions=dict(votes_other=sum,
                                       votes_DEM=sum,
                                       votes_GOP=sum),
        )(df)
        summary = HarvardDataverseCongressDistrict().get_direct()
        # House totals must match the 2018 district summary, modulo a few
        # known-missing FL districts / the NY-25 special, and ME (ranked
        # choice) discrepancies are ignored.
        e.validate_same(
            by_district[by_district.office == "us house"],
            summary[(summary.year == 2018) & (summary.office == "us house")],
            key_cols=["state", "district", "special"],
            check_cols=["votes_DEM", "votes_GOP"],
            ignore_missing=(
                [
                    ("FL", 10, False),
                    ("FL", 14, False),
                    ("FL", 21, False),
                    ("FL", 24, False),
                ],
                [("NY", 25, True)],
            ),
            ignore_discrepancies=lambda k: k[0] == "ME",
        )
        e.validate_same(
            by_district[by_district.office == "us senate"],
            summary[(summary.year == 2018) & (summary.office == "us senate")],
            key_cols=["state", "district", "special"],
            check_cols=["votes_DEM", "votes_GOP"],
            ignore_discrepancies=lambda k: k[0] == "ME",
        )

        # Fill in uncontested house races: the listed FL counties have no
        # reported house results, so estimates are derived from the
        # configured replacement offices.
        df = e.handle_uncontested(
            df,
            missing_counties=[(e.usa_county_to_fips("state")(county,
                                                             dict(state="FL")),
                               party) for county, party in [
                                   ("Hillsborough", "DEM"),
                                   ("Miami-Dade", "DEM"),
                                   ("Broward", "DEM"),
                                   ("Orange", "DEM"),
                               ]],
            missing_office="us house",
            replacement_offices=self.uncontested_replacements,
            fix_cols=["votes_DEM", "votes_GOP", "votes_other"],
            replacement_mode=self.uncontested_replacement_mode,
        )

        return df
    def get_direct(self):
        """Load the MIT Election Lab historical US House and Senate returns,
        normalize parties/offices/districts, pivot votes into per-party
        columns, and apply pointwise 2018 corrections (Ballotpedia, NYT,
        Wikipedia).
        """
        df_house = e.to_csv(
            e.download(
                "https://dataverse.harvard.edu/api/access/datafile/4202836"),
            sep="\t",
        )
        # Drop runoff rounds; only the general-stage result is kept.
        df_house = df_house[~(df_house.runoff == True)]
        del df_house["runoff"], df_house["fusion_ticket"]
        df_senate = e.to_csv(
            e.download(
                "https://dataverse.harvard.edu/api/access/datafile/4300300"),
            sep="\t",
        )
        # Use the simplified party label so the senate columns line up
        # with the house file.
        df_senate["party"] = df_senate["party_simplified"]
        del df_senate["party_simplified"], df_senate["party_detailed"]
        # Both frames must now have identical column sets before concat.
        assert sorted(df_house) == sorted(df_senate)
        df = pd.concat([df_house, df_senate])
        df = df[df.stage == "gen"]
        # Map district 0 to 1 (presumably at-large seats — confirm against
        # the MIT codebook).
        df.district = df.district.apply(lambda x: 1 if x == 0 else x)

        # Collapse fringe/variant party labels onto democratic / republican
        # / other before aggregating.
        party_normalizer = e.usa_party_normalizer("candidate")
        party_normalizer.rewrite["aloha democratic"] = "other"
        party_normalizer.rewrite["independent-republican"] = "republican"
        party_normalizer.rewrite["national democrat"] = "democratic"
        party_normalizer.rewrite[
            "democratic-nonpartisan league"] = "democratic"
        party_normalizer.rewrite["foglietta (democrat)"] = "democratic"
        party_normalizer.rewrite["regular democracy"] = "other"
        party_normalizer.rewrite[
            "national democratic party of alabama"] = "democratic"
        party_normalizer.rewrite["democracy in action"] = "other"
        party_normalizer.rewrite["pro-democracy reform"] = "other"
        party_normalizer.rewrite["academic christian freedom"] = "other"
        party_normalizer.rewrite[
            "quality congressional representation"] = "other"
        party_normalizer.rewrite["representing the 99%"] = "other"
        party_normalizer.apply_to_df(df,
                                     "party",
                                     "party",
                                     var_name="party_normalizer")
        # Sum candidate votes per race and party.
        agg = e.Aggregator(
            grouped_columns=[
                "year", "state_po", "district", "party", "special"
            ],
            aggregation_functions={"candidatevotes": sum},
        )

        agg.removed_columns.append("candidate")
        agg.removed_columns.append("writein")

        df = agg(df)

        del df["mode"], df["totalvotes"], df["unofficial"]

        df = df.rename(columns={"candidatevotes": "votes"})
        # Pivot party into columns, then flatten the resulting MultiIndex
        # into names like "votes_DEM".
        df = e.columns_for_variable(df,
                                    values_are="votes",
                                    columns_for="party")
        df.columns = ["_".join(col).strip("_") for col in df.columns.values]

        df["state"] = df["state_po"]
        del df["state_po"]

        e.usa_office_normalizer().apply_to_df(df, "office", "office")

        # Pointwise fixes, CT from Ballotpedia, ME from NYT
        # Each entry is (state, district, votes_DEM, votes_GOP[, special]).
        fixes = {
            "us house": [
                ("CT", 1, 175087, 96024),
                ("CT", 2, 179731, 102483),
                ("CT", 3, 174572, 95667),
                ("CT", 4, 168726, 106921),
                ("CT", 5, 151225, 119426),
                ("ME", 2, 131954, 134061),
            ],
            "us senate": [
                ("CT", "statewide", 825579, 545717),
                ("NY", "statewide", 4056931, 1998220),
                # MS special: GOP total combines two candidates.
                ("MS", "statewide", 386742, 389995 + 154878, True),
            ],
        }

        # Wikipedia
        # Pull all NY house districts from the results table; columns
        # 0/1/3/5 are district, DEM votes, GOP votes, and a discarded field.
        wiki_ny = e.read_wikipedia(
            "https://en.wikipedia.org/wiki/2018_United_States_House_of_Representatives_elections_in_New_York",
            "Republican Hold",
        )
        for i, (district, dem_votes, gop_votes,
                _) in enumerate(np.array(wiki_ny)[:-1, [0, 1, 3, 5]]):
            # Sanity check that the table rows are in district order.
            assert district == f"District {i + 1}"
            fixes["us house"].append(("NY", i + 1, dem_votes, gop_votes))

        # Apply all 2018 overrides; the optional 5th tuple element marks a
        # special election (defaults to False).
        for office in fixes:
            for state, dist, dem, gop, *special in fixes[office]:
                if special:
                    [special] = special
                else:
                    special = False
                df.loc[(df.year == 2018)
                       & (df.state == state)
                       & (df.office == office)
                       & (df.district == dist)
                       & (df.special == special),
                       ["votes_DEM", "votes_GOP"], ] = [dem, gop]

        return df