Пример #1
0
def check_data_correctness(filename):
    """Validate the merged ECDC dataset loaded from *filename*.

    Checks performed:
      1. Every country has a standardized OWID ``location`` name; if not,
         a CSV of unmatched names is written for the OWID standardizer.
      2. No duplicate (dateRep, location) rows exist.
      3. Every location appears in the population dataset (warning only,
         does not count as an error).

    Returns:
        bool: True when no errors were found, False otherwise.
    """
    errors = 0
    df_merged = _load_merged(filename)

    # Check that every country name is standardized.
    df_uniq = df_merged[['countriesAndTerritories', 'geoId', 'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print("\n" + ERROR + " Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        # Create the directory portably instead of shelling out to `mkdir -p`.
        os.makedirs(os.path.abspath(TMP_PATH), exist_ok=True)
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print("\nSaved CSV file to be standardized at %s. \nRun it through the OWID standardizer and save in %s" % (
            colored(os.path.abspath(csv_path), 'magenta'),
            colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')
        ))
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])

    # Check for duplicate rows: each (date, location) pair must be unique.
    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print("\n" + ERROR + " Found duplicate rows:")
        print(df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print("\nPlease " + colored("fix or remove the duplicate rows", 'magenta') + " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx")
        print("Also please " + colored("note down any changes you made", 'magenta') + " in %s" % os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1

    # Check for missing population figures.
    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()

    return errors == 0
Пример #2
0
def check_data_correctness(df_merged):
    """Validate the merged dataset *df_merged*.

    Checks performed:
      1. Every country has a standardized OWID ``location`` name.
      2. No duplicate (date, location) rows exist.
      3. Every location appears in the population dataset; missing entities
         only trigger a Slack-style warning, not an error.

    Returns:
        bool: True when no errors were found, False otherwise.
    """
    errors = 0

    # Check that every country name is standardized
    df_uniq = df_merged[["Country/Region", "location"]].drop_duplicates()
    if df_uniq["location"].isnull().any():
        print_err("\n" + ERROR + " Could not find OWID names for:")
        print_err(df_uniq[df_uniq["location"].isnull()])
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=["location"])

    # Check for duplicate rows
    if df_merged.duplicated(subset=["date", "location"]).any():
        print_err("\n" + ERROR + " Found duplicate rows:")
        print_err(df_merged[df_merged.duplicated(subset=["date", "location"])])
        errors += 1

    # Check for missing population figures; "International" is a synthetic
    # aggregate with no population entry, so it is deliberately excluded.
    df_pop = load_population()
    pop_entity_diff = set(df_uniq["location"]) - set(df_pop["location"]) - set(["International"])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()
        formatted_msg = ", ".join([f"`{entity}`" for entity in pop_entity_diff])
        send_warning(
            channel="corona-data-updates",
            title="Some entities are missing from the population dataset",
            message=formatted_msg
        )

    return errors == 0
Пример #3
0
def check_data_correctness():
    """Validate the merged ECDC dataset.

    Checks performed:
      1. Every country has a standardized OWID ``location`` name; if not,
         a CSV of the unmatched rows is written out for standardization.
      2. No duplicate (DateRep, location) rows exist.
      3. Every location appears in the population dataset (warning only).

    Returns:
        bool: True when no errors were found, False otherwise.
    """
    errors = 0
    df_merged = _load_merged()
    df_uniq = df_merged[['Countries and territories', 'GeoId',
                         'location']].drop_duplicates()
    df_pop = load_population()

    # Check that every country name is standardized.
    if df_uniq['location'].isnull().any():
        print("Error: Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc_locations.csv')
        print(os.path.abspath(TMP_PATH))
        # Create the directory portably instead of shelling out to `mkdir -p`.
        os.makedirs(os.path.abspath(TMP_PATH), exist_ok=True)
        df_uniq \
            .sort_values(by=['location']) \
            .to_csv(csv_path, index=False)
        print("Saved CSV file to be standardized at %s" % csv_path)
        errors += 1

    # Check for duplicate rows. Note: the printed rows now use the same
    # subset as the check itself (the original printed rows duplicated on
    # 'Our World in Data', which did not match the condition being tested).
    dup_mask = df_merged.duplicated(subset=['DateRep', 'location'])
    if dup_mask.any():
        print("Found duplicate rows:")
        print(df_merged[dup_mask])
        errors += 1

    # Check for missing population figures.
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("These entities were not found in the population dataset:")
        print(pop_entity_diff)

    return errors == 0
Пример #4
0
def check_data_correctness(filename):
    """Validate the merged ECDC dataset loaded from *filename*.

    Checks performed:
      1. Every country has a standardized OWID ``location`` name; if not,
         a CSV of unmatched names is written for the OWID standardizer.
      2. No duplicate (dateRep, location) rows exist.
      3. Every location appears in the population dataset (warning only;
         the synthetic "International" aggregate is excluded).
      4. Sudden changes: per-location 7-day averages of cases/deaths that
         jump by 50%+ (and exceed 100), or negative values, trigger a
         warning notification but are not counted as errors.

    Returns:
        bool: True when no errors were found, False otherwise.
    """
    errors = 0

    df_merged = _load_merged(filename)

    # Check that every country name is standardized
    df_uniq = df_merged[['countriesAndTerritories', 'geoId',
                         'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print_err("\n" + ERROR + " Could not find OWID names for:")
        print_err(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        # Create the directory portably instead of shelling out to `mkdir -p`.
        os.makedirs(os.path.abspath(TMP_PATH), exist_ok=True)
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print_err(
            "\nSaved CSV file to be standardized at %s. \nRun it through the OWID standardizer and save in %s"
            % (colored(os.path.abspath(csv_path), 'magenta'),
               colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')))
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])

    # Check for duplicate rows
    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print_err("\n" + ERROR + " Found duplicate rows:")
        print_err(
            df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print_err(
            "\nPlease " +
            colored("fix or remove the duplicate rows", 'magenta') +
            " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx"
        )
        print_err("Also please " +
                  colored("note down any changes you made", 'magenta') +
                  " in %s" %
                  os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1

    # Check for missing population figures
    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location']) - set(
        ['International'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING +
              " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()
        formatted_msg = ", ".join(
            [f"`{entity}`" for entity in pop_entity_diff])
        send_warning(
            channel='corona-data-updates',
            title='Some entities are missing from the population dataset',
            message=formatted_msg)

    # Check for sudden changes
    sudden_changes_msg = ''
    for location, df_location in df_merged.groupby('location'):
        # Skip checks for "International"
        if location == 'International':
            continue

        for var_name in ['cases', 'deaths']:
            country_vals = df_location[var_name].dropna().values
            if len(country_vals) == 0:
                continue

            # Need at least 2 values for a meaningful previous window;
            # with a single value np.mean([]) would emit a RuntimeWarning
            # and produce NaN (whose comparison was False anyway).
            if len(country_vals) >= 2:
                previous_RA = np.mean(country_vals[-8:-1])
                new_RA = np.mean(country_vals[-7:])

                if new_RA >= 1.5 * previous_RA and new_RA > 100:
                    sudden_changes_msg += "<!> Sudden increase of *{}* in *{}*: {} (7-day average was {})\n".format(
                        var_name, location, int(country_vals[-1]),
                        int(previous_RA))

            new_val = country_vals[-1]

            # Counts should never be negative.
            if new_val < 0:
                sudden_changes_msg += f"<!> Negative number of *{var_name}* in *{location}*: {new_val}\n"

    if sudden_changes_msg:
        print(sudden_changes_msg)
        formatted_msg = sudden_changes_msg.replace('<!>', ':warning:')
        send_warning(channel='corona-data-updates',
                     title='Sudden changes in data',
                     message=formatted_msg)

    return errors == 0