def run_pipeline(indicator):
    """Extract one YouGov indicator from the downloaded Excel workbook and save it.

    ``indicator`` is prefixed with ``yougov_``; the remainder keys into
    INDICATOR_MAP to find the worksheet name.
    """
    # The Excel sheet to use
    sheet = INDICATOR_MAP[indicator.replace("yougov_", "")]
    # Read dataframe (header=2 skips the sheet's title rows)
    dataset = pandas.read_excel(request.content, sheet_name=sheet, header=2)
    dataset = dataset.rename(columns={"Country/region": "country"})
    dataset = dataset.drop(columns=["region"])
    # Stack dates: one row per (country, date) observation
    dataset = dataset.set_index("country")
    dataset = dataset.stack()
    dataset = dataset.reset_index()
    dataset = dataset.rename(columns={"level_1": "date", 0: "value"})
    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(
        lambda country: normalize_country(country))
    # Drop any unnamed columns: header cells without a date become
    # "Unnamed: N" labels in the stacked date column
    dataset["invalid_date"] = dataset["date"].apply(
        lambda date: isinstance(date, str) and date.startswith("Unnamed"))
    dataset = dataset[~dataset["invalid_date"]]
    # Normalize date: cells may arrive as datetime objects or as strings
    dataset["date"] = dataset["date"].apply(lambda date: normalize_date(
        date if isinstance(date, str) else date.strftime("%Y-%m-%d"),
        "%Y-%m-%d"))
    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})
    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]
    save_indicator(indicator, dataset=dataset)
def run_pipeline(indicator):
    """Download one JHU CSSE time series, tidy it, and save it as CSV.

    Wide per-date columns are melted into (iso_code, state, date, value)
    rows, then state-level observations are summed to the country level.
    """
    # Fetch and parse the raw CSV for this indicator
    frame = pd.read_csv(repository_url + file_map[indicator])
    # Give the geographic columns friendlier names
    frame = frame.rename(columns={
        "Country/Region": "country",
        "Province/State": "state"
    })
    # Restrict to the geography columns plus the per-date value columns
    date_columns = [column for column in frame if is_date(column, "%m/%d/%y")]
    frame = frame[["country", "state"] + date_columns]
    # Normalize country names:
    # * Drop non-countries
    non_countries = [
        "Diamond Princess", "MS Zaandam", "Kosovo", "Summer Olympics 2020"
    ]
    frame = frame[~frame.country.isin(non_countries)]
    # * Fix Taiwan (it is not clear why there is a star next to the name)
    #   and Micronesia (the dataset means the Federated States)
    frame["country"] = frame["country"].replace({
        "Taiwan*": "Taiwan",
        "Micronesia": "Micronesia, Federated States of",
    })
    # * Convert names to ISO codes, then drop the raw name column
    frame["iso_code"] = frame["country"].apply(
        lambda country: normalize_country(country))
    frame = frame.drop(columns=["country"])
    # Reshape the wide date columns into one observation per row
    frame = frame.melt(id_vars=["iso_code", "state"])
    frame = frame.rename(columns={"variable": "date", "value": indicator})
    # Normalize date format
    frame["date"] = frame["date"].apply(
        lambda date: normalize_date(date, "%m/%d/%y"))
    # Verify uniqueness
    if frame.duplicated(["iso_code", "state", "date"]).any(axis=None):
        raise Exception("Duplicates in data detected")
    # Collapse states into country totals
    frame = frame.groupby(["iso_code", "date"], as_index=False).agg("sum")
    # Save as CSV file
    save_indicator(indicator, dataset=frame)
def run_pipeline(indicator):
    """Extract a YouGov chart series from data_dict and save it as an indicator.

    Raises:
        ValueError: if no chart in data_dict matches the indicator's label.
    """
    # The YouGov label to look for
    needle = INDICATOR_MAP[indicator.replace("yougov_", "")]
    # Find the chart series for this indicator
    # (avoid `id`/`object` as loop names — they shadow builtins)
    chart_series = None
    for chart_id, chart in data_dict.items():
        label = chart_id.replace("chart_", "#") + " " + chart["title"]
        if needle.lower() == label.lower():
            chart_series = chart["chartSeries"]
            break
    # Fail loudly with a clear message instead of letting the loop below
    # raise an opaque TypeError on iterating None
    if chart_series is None:
        raise ValueError("No chart series found for indicator: " + indicator)
    # Convert the data in the dict into a pandas dataframe:
    # The series contains the country name and a data object
    # The data object is a list of data points in the format [timestamp, value]
    data = []
    for series in chart_series:
        for observation in series["data"]:
            data.append({
                "country": series["name"],
                # Timestamps are in milliseconds since the epoch
                "date": strftime("%Y-%m-%d", gmtime(observation[0] / 1000)),
                "value": observation[1],
            })
    # Convert dict to dataframe
    dataset = pandas.DataFrame.from_dict(data)
    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(
        lambda country: normalize_country(country))
    # Normalize date
    dataset["date"] = dataset["date"].apply(
        lambda date: normalize_date(date, "%Y-%m-%d"))
    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})
    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]
    save_indicator(indicator, dataset=dataset)
# Location of the OWID excess-mortality dataset
dataset_url = (
    "https://covid.ourworldindata.org/data/excess_mortality/excess_mortality.csv"
)
# Fetch with a browser-like user agent so the request is not rejected
req = Request(dataset_url)
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0",
)
content = urlopen(req)

# Parse into dataframe
dataset = pd.read_csv(content)

# Normalize date format
dataset["date"] = dataset["date"].apply(lambda date: normalize_date(date, "%Y-%m-%d"))

# Map every p-score/deaths column onto our naming scheme
new_column_names = {
    column: column.replace("p_scores_", "weekly_excess_mortality_p_score_")
    .replace("_all_ages", "")
    .replace("average_", "avg_")
    .replace("deaths_", "weekly_deaths_")
    for column in dataset.columns
}
dataset.rename(columns=new_column_names, inplace=True)

# Normalize country names:
# * Drop non countries
# Drop any rows with subnational observations: keep only rows where every
# subnational identifier column is empty
subnational_columns = [
    "sub_region_1",
    "sub_region_2",
    "metro_area",
    "iso_3166_2_code",
    "census_fips_code",
]
dataset = dataset[dataset[subnational_columns].isnull().all(axis="columns")]

# Normalize countries
dataset["iso_code"] = dataset["country_region"].apply(
    lambda country: normalize_country(country))

# Normalize date format
dataset["date"] = dataset["date"].apply(
    lambda date: normalize_date(date, "%Y-%m-%d"))


def run_pipeline(indicator):
    """Slice one Google mobility indicator into (iso_code, date, value) rows."""
    # Map the indicator name back to the source column name
    column = (indicator.replace("google_mobility_change_", "", 1)
              + "_percent_change_from_baseline")
    # Slice out country ID, date, and the indicator column, renamed
    frame = dataset[["iso_code", "date", column]].rename(
        columns={column: indicator})
    # Drop rows without observation
    frame = frame.dropna(subset=[indicator], axis="index")
from helpers.camel_case_to_snake_case import camel_case_to_snake_case

# Location of the OxCGRT policy-tracker dataset
dataset_url = "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv"

# Parse into dataframe
dataset = pd.read_csv(dataset_url, low_memory=False)

# Drop subnational data
dataset = dataset[dataset["RegionCode"].isnull()]

# Rename columns
dataset = dataset.rename(columns={"CountryCode": "iso_code", "Date": "date"})

# Normalize date format (dates arrive as YYYYMMDD integers)
dataset["date"] = dataset["date"].apply(
    lambda date: normalize_date(str(date), "%Y%m%d"))


def normalize_column(column_name):
    """Translate an OxCGRT column header into an ox_-prefixed snake_case name."""
    cleaned = column_name.replace(" ", "_").replace("/", "_or_")
    return "ox_" + camel_case_to_snake_case(cleaned)


def run_pipeline(indicator):
    """Slice the source column matching this indicator out of the dataset."""
    # Find the raw column whose normalized name equals the indicator
    column = next(
        c for c in dataset.columns if normalize_column(c) == indicator)
    # Create slice of data with country ID, date, and indicator
    frame = dataset[["iso_code", "date", column]]
from helpers.save_indicator import save_indicator
from helpers.camel_case_to_snake_case import camel_case_to_snake_case

# Location of the SDSN data request export
dataset_url = (
    "https://raw.githubusercontent.com/RobertoFerC/SDSN_data_request/main/SDSN_Data.csv"
)

# Parse into dataframe
dataset = pd.read_csv(dataset_url, low_memory=False)

# Rename columns
dataset = dataset.rename(columns={"Country": "iso_code", "Date": "date"})

# Normalize date format
dataset["date"] = dataset["date"].apply(
    lambda date: normalize_date(str(date), "%Y-%m-%d"))


def run_pipeline(indicator):
    """Save one ICL indicator as (iso_code, date, value) rows."""
    # The source column carries the indicator name without the icl_ prefix
    source_column = indicator.replace("icl_", "")
    frame = dataset.rename(columns={source_column: indicator})
    # Keep only relevant columns
    frame = frame[["iso_code", "date", indicator]]
    # Drop observations without indicator value
    frame = frame.dropna(subset=[indicator], axis="index")
    # Save as CSV file
    save_indicator(indicator, dataset=frame)