Example #1
0
def run_pipeline(indicator):
    """Fetch the YouGov Excel sheet for *indicator* and save it as a
    normalized (iso_code, date, value) indicator CSV.

    The sheet name is looked up in INDICATOR_MAP by the indicator name
    with its "yougov_" prefix removed.
    """
    # The Excel sheet to use
    sheet = INDICATOR_MAP[indicator.replace("yougov_", "")]

    # Read dataframe (header=2 skips the sheet's title rows)
    dataset = pandas.read_excel(request.content, sheet_name=sheet, header=2)
    dataset = dataset.rename(columns={"Country/region": "country"})
    dataset = dataset.drop(columns=["region"])

    # Stack dates: one row per (country, date) observation
    dataset = dataset.set_index("country")
    dataset = dataset.stack()
    dataset = dataset.reset_index()
    dataset = dataset.rename(columns={"level_1": "date", 0: "value"})

    # Normalize countries to ISO codes
    dataset["iso_code"] = dataset["country"].apply(normalize_country)

    # Normalize date:
    # drop any "Unnamed" date entries produced by empty Excel columns,
    # using a boolean mask instead of a throwaway helper column
    invalid = dataset["date"].apply(
        lambda date: isinstance(date, str) and date.startswith("Unnamed"))
    dataset = dataset[~invalid]
    dataset["date"] = dataset["date"].apply(lambda date: normalize_date(
        date if isinstance(date, str) else date.strftime("%Y-%m-%d"),
        "%Y-%m-%d"))

    # Rename the value column to the indicator name
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    save_indicator(indicator, dataset=dataset)
Example #2
0
def run_pipeline(indicator):
    """Extract the ICL column matching *indicator* and save it as CSV."""
    # The source column is the indicator name without the "icl_" prefix;
    # rename it to the full indicator name.
    source_column = indicator.replace("icl_", "")
    frame = dataset.rename(columns={source_column: indicator})

    # Restrict to country ID, date, and the indicator value, then
    # discard rows without an observation.
    frame = frame[["iso_code", "date", indicator]].dropna(
        subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
Example #3
0
def run_pipeline(indicator):
    """Slice the Marioli dataset down to *indicator* and save it as CSV."""
    # Source column name: the indicator without its "marioli_" prefix
    source = indicator.replace("marioli_", "")

    # Keep country ID, date, and the source column, rename it to the
    # indicator, and drop rows without an observation — all in one chain.
    frame = (dataset[["iso_code", "date", source]]
             .rename(columns={source: indicator})
             .dropna(subset=[indicator], axis="index"))

    save_indicator(indicator, dataset=frame)
def run_pipeline(indicator):
    """Slice the OWID dataset down to *indicator* and save it as CSV."""
    # Strip the "owid_" prefix (first occurrence only) to get the
    # source column name
    column = indicator.replace("owid_", "", 1)

    # Slice of data with country ISO, date, and the source column
    frame = dataset[["iso_code", "date", column]]

    # Relabel the source column as the indicator
    frame = frame.rename(columns={column: indicator})

    # Keep only rows that carry an observation
    frame = frame[frame[indicator].notna()]

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
Example #5
0
def run_pipeline(indicator):
    """Extract a Google mobility column as *indicator* and save it as CSV."""
    # Source column: strip the indicator prefix (first occurrence only)
    # and append the suffix used by the mobility report's column names.
    stem = indicator.replace("google_mobility_change_", "", 1)
    column = stem + "_percent_change_from_baseline"

    # Country ID, date, and the indicator value only; rows without an
    # observation are dropped.
    frame = (dataset[["iso_code", "date", column]]
             .rename(columns={column: indicator})
             .dropna(subset=[indicator], axis="index"))

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
Example #6
0
def run_pipeline(indicator):
    """Find the dataset column whose normalized name equals *indicator*,
    slice it out, and save it as CSV."""
    # Locate the raw column name that normalizes to this indicator
    column = next(name for name in dataset.columns
                  if normalize_column(name) == indicator)

    # Slice of data with country ID, date, and the located column,
    # relabelled to the indicator name
    frame = dataset[["iso_code", "date", column]]
    frame = frame.rename(columns={column: indicator})

    # Discard rows that have no observation
    frame = frame.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)
Example #7
0
def run_pipeline(indicator):
    """Download a JHU CSSE time-series CSV, normalize it to
    (iso_code, date, indicator) rows, and save it as CSV.

    Raises:
        ValueError: if the same (iso_code, state, date) appears twice.
    """
    # Parse into dataframe
    url = repository_url + file_map[indicator]
    data = pd.read_csv(url)

    # Relabel country and state column
    data = data.rename(columns={
        "Country/Region": "country",
        "Province/State": "state"
    })

    # Keep country, state, and dates only
    date_columns = [column for column in data if is_date(column, "%m/%d/%y")]
    data = data[["country", "state"] + date_columns]

    # Normalize country names:
    # * Drop non-countries
    data = data[~data.country.isin(
        ["Diamond Princess", "MS Zaandam", "Kosovo", "Summer Olympics 2020"])]
    # * Fix Taiwan: It is not clear why there is a star next to the name
    data["country"] = data["country"].replace({"Taiwan*": "Taiwan"})
    # * Fix Micronesia: Micronesia refers to the Federated States
    data["country"] = data["country"].replace(
        {"Micronesia": "Micronesia, Federated States of"})
    # * Perform conversion
    data["iso_code"] = data["country"].apply(normalize_country)
    # * Drop country name
    data = data.drop(columns=["country"])

    # Reshape into list
    data = data.melt(id_vars=["iso_code", "state"])
    data = data.rename(columns={"variable": "date", "value": indicator})

    # Normalize date format
    data["date"] = data["date"].apply(
        lambda date: normalize_date(date, "%m/%d/%y"))

    # Verify uniqueness
    if data.duplicated(["iso_code", "state", "date"]).any(axis=None):
        raise ValueError("Duplicates in data detected")

    # Collapse states into country: sum only the indicator column, so the
    # textual "state" column is not string-concatenated (or rejected) by
    # the aggregation
    data = data.groupby(["iso_code", "date"], as_index=False)[indicator].sum()

    # Save as CSV file
    save_indicator(indicator, dataset=data)
def run_pipeline(indicator):
    """Extract a YouGov chart series for *indicator* from data_dict and
    save it as a normalized (iso_code, date, value) indicator CSV.

    Raises:
        ValueError: if no chart in data_dict matches the indicator.
    """
    # The YouGov label to look for
    needle = INDICATOR_MAP[indicator.replace("yougov_", "")]

    # Find the chart series for this indicator. Labels are built from
    # the "chart_<number>" dict key plus the chart title.
    # (Renamed loop variables: the original shadowed the builtins
    # `id` and `object`.)
    chart_series = None
    for chart_id, chart in data_dict.items():
        label = chart_id.replace("chart_", "#") + " " + chart["title"]
        if needle.lower() == label.lower():
            chart_series = chart["chartSeries"]
            break
    if chart_series is None:
        # Fail with a clear message instead of an opaque TypeError when
        # iterating None below
        raise ValueError("No chart found for indicator: " + indicator)

    # Convert the data in the dict into a pandas dataframe:
    # The series contains the country name and a data object
    # The data object is a list of data points in the format [timestamp, value]
    data = []
    for series in chart_series:
        for observation in series["data"]:
            data.append({
                "country": series["name"],
                # Timestamps are milliseconds since the epoch
                "date": strftime("%Y-%m-%d", gmtime(observation[0] / 1000)),
                "value": observation[1],
            })

    # Convert dict to dataframe
    dataset = pandas.DataFrame.from_dict(data)

    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(normalize_country)

    # Normalize date
    dataset["date"] = dataset["date"].apply(
        lambda date: normalize_date(date, "%Y-%m-%d"))

    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    save_indicator(indicator, dataset=dataset)
Example #9
0
def run_pipeline(indicator):
    """Build the smoothed SDSN indicator — a 14-day rolling mean of the
    source indicator, computed per country — and save it as CSV.

    The source indicator name is INDICATOR_MAP[key], where key is the
    indicator without its "sdsn_" prefix and "_smoothed" suffix.
    """
    # Get the source indicator
    key = indicator.replace("sdsn_", "").replace("_smoothed", "")
    source_indicator = INDICATOR_MAP[key]

    # Load the source dataset from the shared indicator folder
    path_to_source_data = os.path.join(INDICATOR_FOLDER,
                                       source_indicator + ".csv")
    dataset = pandas.read_csv(path_to_source_data)

    # Drop country name
    dataset = dataset[["iso_code", "date", source_indicator]]

    # Index on iso_code and date
    dataset["date"] = pandas.to_datetime(dataset["date"])
    dataset = dataset.set_index(["iso_code", "date"])

    # Identify first and last date; today is appended so the reindexed
    # range extends to the present even when the source data lags
    today = strftime("%Y-%m-%d", localtime())
    dates = dataset.index.get_level_values("date")
    dates = dates.append(pandas.DatetimeIndex([pandas.to_datetime(today)]))

    # Fill missing days in dataset: reindex onto the cartesian product of
    # all countries and all calendar days, so the time-based rolling
    # window below sees explicit NaN rows for missing days
    timerange = pandas.date_range(dates.min(), dates.max(), freq="D")
    dataset = dataset.reindex(
        pandas.MultiIndex.from_product([dataset.index.levels[0], timerange],
                                       names=["iso_code", "date"]), )

    # Generate rolling average over two week window, with at least 50% data
    # coverage. We use np.mean to average numbers, because it avoids weird
    # floating point issues when values are equal to zero:
    # See: https://groups.google.com/g/pydata/c/Bl7QLr-Y5Z0
    # The result carries an (iso_code, date) index, which pandas aligns
    # back onto dataset's index on assignment.
    dataset[indicator] = (dataset.reset_index(
        level="iso_code").groupby("iso_code")[source_indicator].rolling(
            "14D", min_periods=7).apply(np.mean))

    # Reshape index back into columns
    dataset.reset_index(inplace=True)

    # Drop rows without observations
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save smoothed indicator
    save_indicator(indicator, dataset=dataset)
def run_pipeline(indicator):
    """Extend the OWID positive test rate by carrying each country's
    latest observed value forward for up to seven days, then save the
    combined series as *indicator*.
    """
    # Get smoothed positive rate
    positive_rate_path = os.path.join(INDICATOR_FOLDER,
                                      "owid_positive_rate.csv")
    dataset = pandas.read_csv(positive_rate_path)

    # Index on iso_code and date
    dataset["date"] = pandas.to_datetime(dataset["date"])
    dataset = dataset.set_index(["iso_code", "date"])

    # Identify first and last date; today is appended so the
    # carry-forward can reach the present
    today = strftime("%Y-%m-%d", localtime())
    dates = dataset.index.get_level_values("date")
    dates = dates.append(pandas.DatetimeIndex([pandas.to_datetime(today)]))

    # Fill missing days in dataset
    timerange = pandas.date_range(dates.min(), dates.max(), freq="D")
    dataset = dataset.reindex(
        pandas.MultiIndex.from_product([dataset.index.levels[0], timerange],
                                       names=["iso_code", "date"]), )

    # Carry-forward the latest positive test rate for each country for up
    # to seven days; only the tail starting at each country's last valid
    # value is forward-filled. Series.ffill replaces the deprecated
    # fillna(method="ffill") with identical semantics.
    dataset["carried_positive_rate_smoothed"] = (dataset.reset_index(
        level="iso_code").groupby("iso_code")["owid_positive_rate"].apply(
            lambda x: x.loc[x.last_valid_index():].ffill(limit=7)))

    # Merge carried positive rate into the positive rate column: observed
    # values win, carried values only fill the gaps
    dataset[indicator] = dataset["owid_positive_rate"].combine_first(
        dataset["carried_positive_rate_smoothed"])

    # Reshape index back into columns
    dataset.reset_index(inplace=True)

    # Drop country name
    dataset = dataset[["iso_code", "date", indicator]]

    # Drop rows without classification
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)
def run_pipeline(indicator):
    """Combine smoothed new cases and positive test rate into a
    classification and save it as *indicator*."""

    def _load(filename, source_column, target_column):
        # Read a smoothed SDSN indicator CSV and relabel its value column
        frame = pandas.read_csv(os.path.join(INDICATOR_FOLDER, filename))
        return frame.rename(columns={source_column: target_column})

    # Get smoothed new cases per million
    new_cases_per_million = _load(
        "sdsn_new_cases_per_million_smoothed.csv",
        "sdsn_new_cases_per_million_smoothed",
        "new_cases_per_million_smoothed",
    )

    # Get smoothed positive rate
    positive_rate = _load(
        "sdsn_positive_test_rate_smoothed.csv",
        "sdsn_positive_test_rate_smoothed",
        "positive_rate_smoothed",
    )

    # Merge datasets on country code, country name, and date
    dataset = pandas.merge(
        new_cases_per_million,
        positive_rate,
        how="outer",
        on=["iso_code", "country", "date"],
    )

    # Generate classification row by row
    dataset[indicator] = dataset.apply(generate_classification, axis=1)

    # Keep country ID, date, and the classification only
    dataset = dataset[["iso_code", "date", indicator]]

    # Drop rows without classification
    dataset = dataset.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=dataset)