def process_and_export():
    """Either caching disabled or file not yet processed; process regardless."""
    data = {}
    for metric in ["Confirmed", "Deaths", "Recovered"]:
        df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv")
        # Pivot all to long
        id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
        value_vars = list(set(df.columns) - set(id_vars))
        df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric)
        df["date"] = pd.to_datetime(df.date, format="%m/%d/%y")
        data[metric] = df.copy()

    # Merge together
    df_country_province = pd.merge(
        data["Confirmed"],
        data["Deaths"],
        how="outer",
        on=["Province/State", "Country/Region", "Lat", "Long", "date"],
    ).merge(
        data["Recovered"],
        how="outer",
        on=["Province/State", "Country/Region", "Lat", "Long", "date"],
    )

    # Clean
    df_country_province.columns = utils.sanitise(
        df_country_province.columns, replace={"long": "lon"}
    )
    df_country_province = df_country_province[
        [
            "date",
            "country_region",
            "province_state",
            "lat",
            "lon",
            "confirmed",
            "deaths",
            "recovered",
        ]
    ].sort_values(["date", "country_region", "province_state"])

    # Country-level data
    df_country = (
        df_country_province.groupby(["date", "country_region"])[
            ["confirmed", "deaths", "recovered"]
        ]
        .sum()
        .reset_index()
    )

    # Export
    print(f"Exporting dataset to {target_dir.resolve()}")
    df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False)
    df_country.to_csv(target_dir / "CSSE_country.csv", index=False)
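
# A minimal, self-contained sketch (not part of the pipeline) of the
# wide-to-long melt used above. The toy frame is hypothetical; the real CSSE
# CSVs carry one column per report date, which melt turns into rows.
def _melt_example():
    import pandas as pd

    wide = pd.DataFrame(
        {
            "Province/State": [None],
            "Country/Region": ["Italy"],
            "Lat": [41.9],
            "Long": [12.5],
            "1/22/20": [0],
            "1/23/20": [2],
        }
    )
    id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
    long_df = wide.melt(id_vars=id_vars, var_name="date", value_name="Confirmed")
    long_df["date"] = pd.to_datetime(long_df.date, format="%m/%d/%y")
    return long_df  # one row per (location, date): two rows here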
def load_polling_data(self):
    """Load polling data for UK General Elections."""
    polls = {}
    for geo in self.geos:
        poll_df = pd.read_csv(
            self.directory / "raw" / f"general_election-{geo}-polls.csv",
            parse_dates=["to"],
        ).sort_values("to")
        poll_df.columns = utils.sanitise(
            poll_df.columns,
            replace={
                "ulster_unionist_party": "uup",
                "sinn_fein": "sf",
                "alliance": "apni",
            },
        )
        polls[geo] = poll_df
    return polls
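
# A runnable sketch (synthetic CSV, not the real polls files) of the
# parse-then-sort pattern used in load_polling_data: parse_dates turns "to"
# into datetimes so sort_values("to") orders polls by fieldwork end date.
def _polls_loading_example():
    import io

    import pandas as pd

    csv = io.StringIO("company,to,con\nYouGov,2017-06-01,0.42\nICM,2017-05-20,0.45\n")
    return pd.read_csv(csv, parse_dates=["to"]).sort_values("to")  # oldest first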
def test_sanitise():
    assert utils.sanitise("Vote Count") == "vote_count"
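
# utils.sanitise itself lives in the repo; the sketch below is an *assumed*
# minimal implementation consistent with this test and with the `replace=`
# calls elsewhere (lowercase, snake_case, optional whole-name replacements).
# It is an illustration only, not the actual utils code.
def _sanitise_sketch(items, replace=None):
    import pandas as pd

    single = isinstance(items, str)
    out = pd.Series([items] if single else list(items))
    out = out.str.lower().str.replace(r"[\s/]+", "_", regex=True)
    if replace:
        out = out.replace(replace)  # whole-name replacements, e.g. {"long": "lon"}
    return out.iloc[0] if single else pd.Index(out)


assert _sanitise_sketch("Vote Count") == "vote_count"
assert _sanitise_sketch(["Country/Region"], replace={"country_region": "region"})[0] == "region"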
def process_hoc_sheet(input_file, data_dir, sheet_name):
    # Import general election results
    print(f"Read and clean {input_file}")
    parties = [
        "Con", "LD", "Lab", "UKIP", "Grn", "SNP", "PC",
        "DUP", "SF", "SDLP", "UUP", "APNI", "Other",
    ]
    results = pd.read_excel(
        data_dir / "raw" / input_file,
        sheet_name=sheet_name,
        skiprows=4,
        header=None,
        skipfooter=19,
    )
    assert results.shape == (650, 49)

    # Specify columns (spread across multiple rows in Excel)
    cols = ["", "id", "Constituency", "County", "Country/Region", "Country", "Electorate", ""]
    for party in parties:
        cols += [f"{party}_Votes", f"{party}_Voteshare", ""]
    cols += ["Total votes", "Turnout"]
    results.columns = cols

    # Some basic data quality checks
    for party in parties:
        assert (
            results[f"{party}_Voteshare"] - results[f"{party}_Votes"] / results["Total votes"]
        ).sum() == 0
    assert (
        results[[f"{party}_Votes" for party in parties]].fillna(0.0).sum(axis=1)
        == results["Total votes"]
    ).all()
    assert ((results["Total votes"] / results["Electorate"]) == results["Turnout"]).all()

    # Drop blank columns plus those that can be calculated
    cols_to_drop = [""] + [c for c in cols if "Voteshare" in c] + ["Total votes", "Turnout"]
    results = results.drop(columns=cols_to_drop)

    # Sanitise column names
    results.columns = utils.sanitise(results.columns)
    results = results.rename(columns={"id": "ons_id", "country_region": "region"})
    results.columns = [c.replace("_votes", "") for c in results.columns]

    # Reshape to long
    results_long = pd.melt(
        results,
        id_vars=["ons_id", "constituency", "county", "region", "country", "electorate"],
        var_name="party",
        value_name="votes",
    )
    assert results.shape == (650, 19)
    assert results_long.shape == (650 * len(parties), 19 - len(parties) + 2)

    # Sort by (ons_id, party)
    results_long["party"] = pd.Categorical(
        results_long.party,
        categories=pd.Series(parties).apply(utils.sanitise),
        ordered=True,
    )
    results_long = results_long.sort_values(["ons_id", "party"]).reset_index(drop=True)

    # Re-add total_votes & voteshare
    results_long["total_votes"] = results_long.ons_id.map(
        results_long.groupby("ons_id").votes.sum().astype(int)
    )
    results_long["voteshare"] = results_long["votes"] / results_long["total_votes"]
    results_long["turnout"] = results_long["total_votes"] / results_long["electorate"]

    # Reorder cols for export
    results_long = results_long[
        [
            "ons_id",
            "constituency",
            "county",
            "region",
            "country",
            "electorate",
            "total_votes",
            "turnout",
            "party",
            "votes",
            "voteshare",
        ]
    ].copy()

    return results_long
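
# A synthetic sketch of the ordered-Categorical trick above: casting `party`
# to an ordered Categorical makes sort_values respect the canonical party
# order rather than plain alphabetical order.
def _categorical_sort_example():
    import pandas as pd

    df = pd.DataFrame({"ons_id": ["A1", "A1", "A1"], "party": ["lab", "con", "ld"]})
    df["party"] = pd.Categorical(df.party, categories=["con", "ld", "lab"], ordered=True)
    return df.sort_values(["ons_id", "party"])  # rows come back in con, ld, lab order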
def process_and_export():
    # Read in PollBase
    df = pd.read_excel(
        self.directory / "raw" / filename,
        sheet_name="17-19",
        usecols="A:C,G:H,I,K,M,O,Q,S,U,Y",
    )

    # Clean it up
    df.columns = utils.sanitise(
        df.columns,
        replace={
            "polling": "company",
            "publisher": "client",
            "unnamed:_24": "method",
            "green": "grn",
            "tig_cuk": "chuk",
        },
    )
    df["year"] = df.year.replace({"?": 2019}).ffill().astype(int)
    df["month"] = df.month.ffill()
    df = df[df["fieldwork"].notnull()].copy()
    # Fieldwork is either a single day ("3") or a range ("30-3"); strip any "?"
    df["day_from"] = df.fieldwork.apply(
        lambda x: str(x).split("-")[0].replace("?", "") if "-" in str(x) else str(x).replace("?", "")
    )
    df["day_to"] = df.fieldwork.apply(
        lambda x: str(x).split("-")[1].replace("?", "") if "-" in str(x) else str(x).replace("?", "")
    )
    df["from"] = pd.to_datetime(
        df.apply(lambda row: f"{row.year}-{row.month}-{row.day_from}", axis=1)
    )
    df["to"] = pd.to_datetime(
        df.apply(lambda row: f"{row.year}-{row.month}-{row.day_to}", axis=1)
    )

    # Fix month & year in df['to'] where e.g. fieldwork is "30-3 Jan"
    month_shifted = (
        df.year.astype(str)
        + "-"
        + ((df.to.dt.month + 1) % 12).astype(str).replace("0", "12")
        + "-"
        + df.day_to.astype(str)
    )
    # If the shifted month wrapped around to January, the year must shift too
    year_needs_shifting = month_shifted.apply(lambda x: str(x).split("-")[1]) == "1"
    month_shifted.loc[year_needs_shifting] = (
        (df.loc[year_needs_shifting, "year"].astype(int) + 1).astype(str)
        + "-"
        + ((df.to.dt.month + 1) % 12).astype(str)
        + "-"
        + df.day_to.astype(str)
    )
    df.loc[df["from"] > df["to"], "to"] = month_shifted.loc[df["from"] > df["to"]]
    df["to"] = pd.to_datetime(df.to)

    # Divide numbers by 100
    for party in ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp"]:
        df[party] = df[party].replace(" ", np.nan).astype(float) / 100

    # Prepare for merge with SixFifty data
    df["sample_size"] = np.nan
    df["snp"] = np.nan
    df["pdf"] = np.nan
    columns = [
        "company", "client", "method", "from", "to", "sample_size",
        "con", "lab", "ld", "ukip", "grn", "chuk", "bxp", "snp", "pdf",
    ]
    df = df[columns].copy().sort_values("to")

    # Read in SixFifty polling data (2005 -> June 2017)
    df_sixfifty = pd.read_csv(self.directory / "raw" / "polls.csv", parse_dates=["from", "to"])
    df_sixfifty["chuk"] = np.nan
    df_sixfifty["bxp"] = np.nan
    df_sixfifty = df_sixfifty[columns].copy().sort_values("to")

    # Merge
    df_sixfifty = df_sixfifty[df_sixfifty.to < df.to.min()].copy()
    assert df_sixfifty.to.max() < df.to.min()
    df_polls = pd.concat([df_sixfifty, df], axis=0)

    # Export
    print(f"Exporting dataset to {processed_results_location.resolve()}")
    df_polls.to_csv(processed_results_location, index=False)
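
# A synthetic sketch of the month-boundary fix above: fieldwork like "30-3" in
# December must roll `to` into January of the *next* year, which is why rows
# with df["from"] > df["to"] are the ones that get shifted. pd.offsets.MonthBegin
# is used here for brevity; the pipeline rebuilds the date string instead.
def _fieldwork_wrap_example():
    import pandas as pd

    year, month, day_from, day_to = 2018, 12, "30", "3"
    start = pd.to_datetime(f"{year}-{month}-{day_from}")
    end = pd.to_datetime(f"{year}-{month}-{day_to}")
    if start > end:  # fieldwork wrapped past a month boundary
        end = end + pd.offsets.MonthBegin(1) + pd.Timedelta(days=int(day_to) - 1)
    return start, end  # (2018-12-30, 2019-01-03)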