예제 #1
0
def run_pipeline(indicator):
    """Build one YouGov indicator from the Excel workbook and save it.

    The sheet name is looked up in ``INDICATOR_MAP`` using ``indicator`` with
    every ``"yougov_"`` occurrence removed. The workbook bytes are read from
    the module-level ``request.content``. The wide sheet (one column per
    date) is stacked into one row per country/date pair, countries and dates
    are normalized, and the result is passed to ``save_indicator``.

    Args:
        indicator: Name of the indicator; also used as the name of the value
            column in the saved dataset.

    Raises:
        KeyError: If the stripped indicator name is not in ``INDICATOR_MAP``,
            or if the sheet lacks the ``"region"`` column being dropped.
    """
    # The Excel sheet to use
    sheet = INDICATOR_MAP[indicator.replace("yougov_", "")]

    # Read dataframe; header=2 takes the column labels from the third row
    dataset = pandas.read_excel(request.content, sheet_name=sheet, header=2)
    dataset = dataset.rename(columns={"Country/region": "country"})
    dataset = dataset.drop(columns=["region"])

    # Stack dates: every remaining column becomes a (country, date, value) row
    dataset = dataset.set_index("country").stack().reset_index()
    dataset = dataset.rename(columns={"level_1": "date", 0: "value"})

    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(normalize_country)

    # Drop rows that came from unnamed columns. The mask is kept as a
    # standalone boolean Series (no helper column), and the filtered frame is
    # copied so the column assignment below does not write into a slice.
    unnamed = dataset["date"].apply(
        lambda label: isinstance(label, str) and label.startswith("Unnamed")
    ).astype(bool)
    dataset = dataset[~unnamed].copy()

    # Normalize date: labels are either strings or objects with strftime()
    dataset["date"] = dataset["date"].apply(
        lambda label: normalize_date(
            label if isinstance(label, str) else label.strftime("%Y-%m-%d"),
            "%Y-%m-%d",
        )
    )

    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    save_indicator(indicator, dataset=dataset)
예제 #2
0
def run_pipeline(indicator):
    """Download one time-series CSV, aggregate it per country, and save it.

    The CSV at ``repository_url + file_map[indicator]`` is read, reduced to
    the country, state and date columns, converted from wide (one column per
    date) to long format, and summed over states so that each
    ``(iso_code, date)`` pair has a single value.

    Args:
        indicator: Key into ``file_map``; also the name of the value column
            in the saved dataset.

    Raises:
        KeyError: If ``indicator`` is not a key of ``file_map``.
        Exception: If the long-format data contains duplicate
            ``(iso_code, state, date)`` rows.
    """
    # Parse into dataframe
    url = repository_url + file_map[indicator]
    data = pd.read_csv(url)

    # Relabel country and state column
    data = data.rename(columns={
        "Country/Region": "country",
        "Province/State": "state",
    })

    # Keep country, state, and dates only
    date_columns = [column for column in data if is_date(column, "%m/%d/%y")]
    data = data[["country", "state"] + date_columns]

    # Normalize country names:
    # * Drop non-countries. The filtered frame is copied so the column
    #   assignments below do not write into a slice of the original.
    non_countries = [
        "Diamond Princess", "MS Zaandam", "Kosovo", "Summer Olympics 2020"
    ]
    data = data[~data["country"].isin(non_countries)].copy()
    # * Fix Taiwan (listed with a trailing star) and Micronesia (which refers
    #   to the Federated States)
    data["country"] = data["country"].replace({
        "Taiwan*": "Taiwan",
        "Micronesia": "Micronesia, Federated States of",
    })
    # * Perform conversion
    data["iso_code"] = data["country"].apply(normalize_country)
    # * Drop country name
    data = data.drop(columns=["country"])

    # Reshape into list
    data = data.melt(id_vars=["iso_code", "state"])
    data = data.rename(columns={"variable": "date", "value": indicator})

    # Normalize date format
    data["date"] = data["date"].apply(
        lambda date: normalize_date(date, "%m/%d/%y"))

    # Verify uniqueness
    if data.duplicated(["iso_code", "state", "date"]).any():
        raise Exception("Duplicates in data detected")

    # Collapse states into country. Only the indicator column is summed:
    # aggregating the whole frame would also try to "sum" the textual state
    # column, which yields concatenated strings or an error on recent pandas.
    data = data.groupby(["iso_code", "date"], as_index=False)[indicator].sum()

    # Save as CSV file
    save_indicator(indicator, dataset=data)
def run_pipeline(indicator):
    """Extract one YouGov chart from ``data_dict`` and save it as an indicator.

    The chart is located by comparing, case-insensitively, the label mapped
    to ``indicator`` in ``INDICATOR_MAP`` with ``"#<n> <title>"`` built from
    each ``data_dict`` entry (the ``"chart_"`` part of the key is replaced by
    ``"#"``). Each data point ``[timestamp, value]`` of every series in the
    chart becomes one row.

    Args:
        indicator: Name of the indicator; every ``"yougov_"`` occurrence is
            removed before the ``INDICATOR_MAP`` lookup. Also used as the
            name of the value column in the saved dataset.

    Raises:
        KeyError: If the stripped indicator name is not in ``INDICATOR_MAP``.
        ValueError: If no entry of ``data_dict`` matches the expected label.
    """
    # The YouGov label to look for
    needle = INDICATOR_MAP[indicator.replace("yougov_", "")].lower()

    # Find the chart series for this indicator
    chart_series = None
    for chart_id, chart in data_dict.items():
        label = chart_id.replace("chart_", "#") + " " + chart["title"]
        if label.lower() == needle:
            chart_series = chart["chartSeries"]
            break

    # Fail with a clear message instead of an opaque error when iterating None
    if chart_series is None:
        raise ValueError(
            "No chart found for indicator {!r} (label {!r})".format(
                indicator, needle))

    # Convert the data in the dict into a list of records:
    # The series contains the country name and a data object
    # The data object is a list of data points in the format [timestamp, value]
    # The timestamp is divided by 1000 before being formatted as a UTC date
    records = [
        {
            "country": series["name"],
            "date": strftime("%Y-%m-%d", gmtime(observation[0] / 1000)),
            "value": observation[1],
        }
        for series in chart_series
        for observation in series["data"]
    ]

    # Convert records to dataframe
    dataset = pandas.DataFrame.from_dict(records)

    # Normalize countries
    dataset["iso_code"] = dataset["country"].apply(normalize_country)

    # Normalize date
    dataset["date"] = dataset["date"].apply(
        lambda date: normalize_date(date, "%Y-%m-%d"))

    # Rename the value column
    dataset = dataset.rename(columns={"value": indicator})

    # Create slice of data with country ID, date, and indicator
    dataset = dataset[["iso_code", "date", indicator]]

    save_indicator(indicator, dataset=dataset)
dataset_url = (
    "https://covid.ourworldindata.org/data/excess_mortality/excess_mortality.csv"
)

# Build the request with an explicit browser User-Agent header
# NOTE(review): presumably the server rejects urllib's default agent — confirm
req = Request(dataset_url)
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0",
)

# Parse into dataframe; the context manager closes the HTTP response once the
# CSV has been read instead of leaving the connection open
with urlopen(req) as content:
    dataset = pd.read_csv(content)

# Normalize date format
dataset["date"] = dataset["date"].apply(lambda date: normalize_date(date, "%Y-%m-%d"))

# Rename p-score columns. The replacements are applied in this order to every
# column name; names containing none of the patterns are left unchanged.
new_column_names = {
    column: (
        column.replace("p_scores_", "weekly_excess_mortality_p_score_")
        .replace("_all_ages", "")
        .replace("average_", "avg_")
        .replace("deaths_", "weekly_deaths_")
    )
    for column in dataset.columns
}

dataset = dataset.rename(columns=new_column_names)

# Normalize country names:
# * Drop non countries
예제 #5
0
# Drop any rows with subnational observations: keep a row only when all of
# the subnational identifier columns are empty. The result is copied so the
# column assignments below operate on an independent frame rather than on a
# slice of the original (avoids pandas' SettingWithCopyWarning).
dataset = dataset[dataset[[
    "sub_region_1",
    "sub_region_2",
    "metro_area",
    "iso_3166_2_code",
    "census_fips_code",
]].isnull().all(axis="columns")].copy()

# Normalize countries
dataset["iso_code"] = dataset["country_region"].apply(normalize_country)

# Normalize date format
dataset["date"] = dataset["date"].apply(
    lambda date: normalize_date(date, "%Y-%m-%d"))


def run_pipeline(indicator):
    """Select the mobility column for ``indicator`` from the shared dataset.

    The source column name is ``indicator`` with its first
    ``"google_mobility_change_"`` occurrence removed and
    ``"_percent_change_from_baseline"`` appended. The resulting frame holds
    ``iso_code``, ``date`` and the value column renamed to ``indicator``,
    without rows whose value is missing.

    NOTE(review): as visible here the frame is neither returned nor saved —
    confirm whether a trailing save step is missing.
    """
    source_column = (
        indicator.replace("google_mobility_change_", "", 1)
        + "_percent_change_from_baseline"
    )

    # Slice, rename to the indicator name, then drop rows without observation
    frame = (
        dataset[["iso_code", "date", source_column]]
        .rename(columns={source_column: indicator})
        .dropna(subset=[indicator], axis="index")
    )
예제 #6
0
from helpers.camel_case_to_snake_case import camel_case_to_snake_case

dataset_url = "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv"

# Load the CSV, drop subnational data (rows whose RegionCode is set), and
# rename the country-code and date columns to the names used downstream
dataset = (
    pd.read_csv(dataset_url, low_memory=False)
    .loc[lambda frame: frame["RegionCode"].isnull()]
    .rename(columns={"CountryCode": "iso_code", "Date": "date"})
)

# Normalize date format; values are stringified first and parsed as YYYYMMDD
dataset["date"] = dataset["date"].apply(
    lambda raw: normalize_date(str(raw), "%Y%m%d"))


def normalize_column(column_name):
    """Return the indicator name corresponding to a dataset column name.

    Spaces become ``"_"`` and slashes become ``"_or_"``; the result is passed
    through ``camel_case_to_snake_case`` and prefixed with ``"ox_"``.
    """
    without_separators = column_name.replace(" ", "_").replace("/", "_or_")
    return "ox_" + camel_case_to_snake_case(without_separators)


def run_pipeline(indicator):
    """Select the dataset column whose normalized name equals ``indicator``.

    Raises ``StopIteration`` when no column of ``dataset`` normalizes to
    ``indicator``.

    NOTE(review): as visible here the frame is neither returned nor saved —
    confirm whether the rest of the function is missing.
    """
    # First column whose normalized name matches the indicator
    matching_columns = (
        name for name in dataset.columns
        if normalize_column(name) == indicator
    )
    source_column = next(matching_columns)

    # Country ID, date, and the matched column
    frame = dataset[["iso_code", "date", source_column]]
예제 #7
0
from helpers.save_indicator import save_indicator
from helpers.camel_case_to_snake_case import camel_case_to_snake_case

dataset_url = (
    "https://raw.githubusercontent.com/RobertoFerC/SDSN_data_request/main/SDSN_Data.csv"
)

# Load the CSV and rename the country and date columns in one step
dataset = pd.read_csv(dataset_url, low_memory=False).rename(
    columns={"Country": "iso_code", "Date": "date"}
)

# Normalize date format; values are stringified first and parsed as YYYY-MM-DD
dataset["date"] = dataset["date"].apply(
    lambda raw: normalize_date(str(raw), "%Y-%m-%d"))


def run_pipeline(indicator):
    """Save one indicator taken from the shared dataset.

    The source column is ``indicator`` with every ``"icl_"`` occurrence
    removed. It is renamed to ``indicator``, reduced to ``iso_code``,
    ``date`` and the value, stripped of rows with a missing value, and
    passed to ``save_indicator``.
    """
    source_column = indicator.replace("icl_", "")

    # Rename the value column, then keep only the relevant columns
    renamed = dataset.rename(columns={source_column: indicator})
    selected = renamed[["iso_code", "date", indicator]]

    # Drop observations without indicator value
    frame = selected.dropna(subset=[indicator], axis="index")

    # Save as CSV file
    save_indicator(indicator, dataset=frame)