def run_pipeline(indicator):
    # The Excel sheet to use
    sheet = INDICATOR_MAP[indicator.replace("yougov_", "")]

    # Read dataframe
    dataset = pandas.read_excel(request.content, sheet_name=sheet, header=2)
    dataset = dataset.rename(columns={"Country/region": "country"})
    dataset = dataset.drop(columns=["region"])

    # Stack dates
    dataset = dataset.set_index("country")
    dataset = dataset.stack()
    dataset = dataset.reset_index()
    dataset = dataset.rename(columns={"level_1": "date", 0: "value"})

    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(
        lambda country: normalize_country(country))

    # Normalize date:
    # * Drop any unnamed columns
    dataset["invalid_date"] = dataset["date"].apply(
        lambda date: isinstance(date, str) and date.startswith("Unnamed"))
    dataset = dataset[~dataset["invalid_date"]]
    # * Convert dates to ISO format
    dataset["date"] = dataset["date"].apply(lambda date: normalize_date(
        date if isinstance(date, str) else date.strftime("%Y-%m-%d"),
        "%Y-%m-%d"))

    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)
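# A minimal sketch of the stack/reset_index reshape above, on hypothetical
# data (the real sheet has one column per survey date):
#
#   >>> wide = pandas.DataFrame({"2020-04-01": [4.0], "2020-04-08": [6.0]},
#   ...                         index=pandas.Index(["Denmark"], name="country"))
#   >>> wide.stack().reset_index()
#       country     level_1    0
#   0   Denmark  2020-04-01  4.0
#   1   Denmark  2020-04-08  6.0
#
# which is why the generated "level_1" and 0 columns are renamed to "date"
# and "value" afterwards.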
def run_pipeline(indicator):
    # Rename value column to indicator
    frame = dataset.rename(columns={indicator.replace("icl_", ""): indicator})

    # Keep only relevant columns
    frame = frame[["iso_code", "date", indicator]]

    # Drop observations without indicator value
    frame = frame.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
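# A minimal sketch of the dropna step shared by these pipelines, on a
# hypothetical frame ("icl_infections" is an invented indicator name): only
# rows missing the indicator value itself are dropped.
#
#   >>> frame = pandas.DataFrame({"iso_code": ["DNK", "SWE"],
#   ...                           "date": ["2020-03-01", "2020-03-01"],
#   ...                           "icl_infections": [42.0, None]})
#   >>> frame.dropna(subset=["icl_infections"], axis="index")
#     iso_code        date  icl_infections
#   0      DNK  2020-03-01            42.0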
def run_pipeline(indicator):
    # Set column name from indicator
    column = indicator.replace("marioli_", "")

    # Create slice of data with country ID, date, and indicator
    frame = dataset[["iso_code", "date", column]]

    # Rename column to indicator
    frame = frame.rename(columns={column: indicator})

    # Drop rows without observation
    frame = frame.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
def run_pipeline(indicator): # Get the column name for this indicator column = indicator.replace("owid_", "", 1) # Create slice of data with country ISO, date, and indicator frame = dataset[["iso_code", "date", column]] # Rename column to indicator frame = frame.rename(columns={column: indicator}) # Drop rows without observation frame = frame.dropna(subset=[indicator], axis="index") # Save as CSV file save_indicator(indicator, dataset=frame)
def run_pipeline(indicator):
    # Get the column name for this indicator
    column = (indicator.replace("google_mobility_change_", "", 1) +
              "_percent_change_from_baseline")

    # Create slice of data with country ID, date, and indicator
    frame = dataset[["iso_code", "date", column]]

    # Rename column to indicator
    frame = frame.rename(columns={column: indicator})

    # Drop rows without observation
    frame = frame.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
def run_pipeline(indicator):
    # Get the column name for this indicator
    column = next(c for c in dataset.columns
                  if normalize_column(c) == indicator)

    # Create slice of data with country ID, date, and indicator
    frame = dataset[["iso_code", "date", column]]

    # Rename column to indicator
    frame = frame.rename(columns={column: indicator})

    # Drop rows without observation
    frame = frame.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
def run_pipeline(indicator):
    # Parse into dataframe
    url = repository_url + file_map[indicator]
    data = pd.read_csv(url)

    # Relabel country and state column
    data = data.rename(columns={
        "Country/Region": "country",
        "Province/State": "state"
    })

    # Keep country, state, and dates only
    data = data[["country", "state"] +
                [column for column in data if is_date(column, "%m/%d/%y")]]

    # Normalize country names:
    # * Drop non-countries
    data = data[~data.country.isin(
        ["Diamond Princess", "MS Zaandam", "Kosovo", "Summer Olympics 2020"])]
    # * Fix Taiwan: It is not clear why there is a star next to the name
    data["country"] = data["country"].replace({"Taiwan*": "Taiwan"})
    # * Fix Micronesia: Micronesia refers to the Federated States
    data["country"] = data["country"].replace(
        {"Micronesia": "Micronesia, Federated States of"})
    # * Perform conversion
    data["iso_code"] = data["country"].apply(
        lambda country: normalize_country(country))
    # * Drop country name
    data = data.drop(columns=["country"])

    # Reshape into list
    data = data.melt(id_vars=["iso_code", "state"])
    data = data.rename(columns={"variable": "date", "value": indicator})

    # Normalize date format
    data["date"] = data["date"].apply(
        lambda date: normalize_date(date, "%m/%d/%y"))

    # Verify uniqueness
    if data.duplicated(["iso_code", "state", "date"]).any(axis=None):
        raise Exception("Duplicates in data detected")

    # Collapse states into country by summing the indicator values; only the
    # indicator column is aggregated, so the non-numeric state column does
    # not end up in the sum
    data = data.groupby(["iso_code", "date"], as_index=False)[indicator].sum()

    # Save as CSV file
    save_indicator(indicator, dataset=data)
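# A minimal sketch of the melt step above, on hypothetical data: each date
# column becomes a (variable, value) row pair per country/state.
#
#   >>> df = pd.DataFrame({"iso_code": ["TWN"], "state": [None],
#   ...                    "1/22/20": [0], "1/23/20": [1]})
#   >>> df.melt(id_vars=["iso_code", "state"])
#     iso_code state variable  value
#   0      TWN  None  1/22/20      0
#   1      TWN  None  1/23/20      1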
def run_pipeline(indicator):
    # Find the chart series for this indicator
    chart_series = None
    # The YouGov label to look for
    needle = INDICATOR_MAP[indicator.replace("yougov_", "")]
    for chart_id, chart in data_dict.items():
        label = chart_id.replace("chart_", "#") + " " + chart["title"]
        if needle.lower() == label.lower():
            chart_series = chart["chartSeries"]
            break
    if chart_series is None:
        raise Exception("No chart series found for " + indicator)

    # Convert the data in the dict into a pandas dataframe:
    # Each series contains the country name and a data object; the data
    # object is a list of data points in the format [timestamp, value]
    data = []
    for series in chart_series:
        for observation in series["data"]:
            data.append({
                "country": series["name"],
                "date": strftime("%Y-%m-%d", gmtime(observation[0] / 1000)),
                "value": observation[1],
            })

    # Convert the list of records to a dataframe
    dataset = pandas.DataFrame(data)

    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(
        lambda country: normalize_country(country))

    # Normalize date
    dataset["date"] = dataset["date"].apply(
        lambda date: normalize_date(date, "%Y-%m-%d"))

    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)
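# The chart timestamps are Unix epochs in milliseconds, hence the division
# by 1000 before formatting. For example:
#
#   >>> from time import gmtime, strftime
#   >>> strftime("%Y-%m-%d", gmtime(1583020800000 / 1000))
#   '2020-03-01'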
def run_pipeline(indicator):
    # Get the source indicator
    key = indicator.replace("sdsn_", "").replace("_smoothed", "")
    source_indicator = INDICATOR_MAP[key]

    # Load the source dataset
    path_to_source_data = os.path.join(INDICATOR_FOLDER,
                                       source_indicator + ".csv")
    dataset = pandas.read_csv(path_to_source_data)

    # Drop country name
    dataset = dataset[["iso_code", "date", source_indicator]]

    # Index on iso_code and date
    dataset["date"] = pandas.to_datetime(dataset["date"])
    dataset = dataset.set_index(["iso_code", "date"])

    # Identify first and last date
    today = strftime("%Y-%m-%d", localtime())
    dates = dataset.index.get_level_values("date")
    dates = dates.append(pandas.DatetimeIndex([pandas.to_datetime(today)]))

    # Fill missing days in dataset
    timerange = pandas.date_range(dates.min(), dates.max(), freq="D")
    dataset = dataset.reindex(
        pandas.MultiIndex.from_product([dataset.index.levels[0], timerange],
                                       names=["iso_code", "date"]))

    # Generate rolling average over a two-week window, with at least 50% data
    # coverage. We use np.mean to average numbers, because it avoids weird
    # floating point issues when values are equal to zero.
    # See: https://groups.google.com/g/pydata/c/Bl7QLr-Y5Z0
    dataset[indicator] = (dataset.reset_index(
        level="iso_code").groupby("iso_code")[source_indicator].rolling(
            "14D", min_periods=7).apply(np.mean))

    # Reshape index back into columns
    dataset.reset_index(inplace=True)

    # Drop rows without observations
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save smoothed indicator
    save_indicator(indicator, dataset=dataset)
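# A minimal sketch of the time-based rolling mean above, on a hypothetical
# daily series: with a "14D" window and min_periods=7, a date only receives
# a smoothed value once at least seven observations fall inside the trailing
# two weeks.
#
#   >>> s = pandas.Series(range(10),
#   ...                   index=pandas.date_range("2021-01-01", periods=10))
#   >>> s.rolling("14D", min_periods=7).apply(np.mean).dropna()
#   2021-01-07    3.0
#   2021-01-08    3.5
#   2021-01-09    4.0
#   2021-01-10    4.5
#   dtype: float64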
def run_pipeline(indicator):
    # Get smoothed positive rate
    positive_rate_path = os.path.join(INDICATOR_FOLDER,
                                      "owid_positive_rate.csv")
    dataset = pandas.read_csv(positive_rate_path)

    # Index on iso_code and date
    dataset["date"] = pandas.to_datetime(dataset["date"])
    dataset = dataset.set_index(["iso_code", "date"])

    # Identify first and last date
    today = strftime("%Y-%m-%d", localtime())
    dates = dataset.index.get_level_values("date")
    dates = dates.append(pandas.DatetimeIndex([pandas.to_datetime(today)]))

    # Fill missing days in dataset
    timerange = pandas.date_range(dates.min(), dates.max(), freq="D")
    dataset = dataset.reindex(
        pandas.MultiIndex.from_product([dataset.index.levels[0], timerange],
                                       names=["iso_code", "date"]))

    # Carry forward the latest positive test rate for each country for up to
    # seven days
    dataset["carried_positive_rate_smoothed"] = (dataset.reset_index(
        level="iso_code").groupby("iso_code")["owid_positive_rate"].apply(
            lambda x: x.loc[x.last_valid_index():].ffill(limit=7)))

    # Merge carried positive rate into the positive rate column
    dataset[indicator] = dataset["owid_positive_rate"].combine_first(
        dataset["carried_positive_rate_smoothed"])

    # Reshape index back into columns
    dataset.reset_index(inplace=True)

    # Keep only relevant columns
    dataset = dataset[["iso_code", "date", indicator]]

    # Drop rows without observation
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)
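# A minimal sketch of the bounded carry-forward above, on a hypothetical
# series: values after the last observation are filled for at most seven
# days, so stale rates eventually drop out again.
#
#   >>> s = pandas.Series([0.1] + [None] * 9,
#   ...                   index=pandas.date_range("2021-01-01", periods=10))
#   >>> s.loc[s.last_valid_index():].ffill(limit=7)
#   2021-01-01    0.1
#   2021-01-02    0.1
#   ...
#   2021-01-08    0.1
#   2021-01-09    NaN
#   2021-01-10    NaN
#   Freq: D, dtype: float64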
def run_pipeline(indicator):
    # Get smoothed new cases per million
    new_cases_per_million_path = os.path.join(
        INDICATOR_FOLDER, "sdsn_new_cases_per_million_smoothed.csv")
    new_cases_per_million = pandas.read_csv(new_cases_per_million_path)
    new_cases_per_million.rename(
        columns={
            "sdsn_new_cases_per_million_smoothed":
                "new_cases_per_million_smoothed"
        },
        inplace=True)

    # Get smoothed positive rate
    positive_rate_path = os.path.join(
        INDICATOR_FOLDER, "sdsn_positive_test_rate_smoothed.csv")
    positive_rate = pandas.read_csv(positive_rate_path)
    positive_rate.rename(
        columns={"sdsn_positive_test_rate_smoothed": "positive_rate_smoothed"},
        inplace=True)

    # Merge datasets
    dataset = pandas.merge(
        new_cases_per_million,
        positive_rate,
        how="outer",
        on=["iso_code", "country", "date"])

    # Generate classification
    dataset[indicator] = dataset.apply(generate_classification, axis=1)

    # Drop country name
    dataset = dataset[["iso_code", "date", indicator]]

    # Drop rows without classification
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)