def check_data_correctness(filename):
    errors = 0

    df_merged = _load_merged(filename)

    # Check that every country name is standardized
    df_uniq = df_merged[['countriesAndTerritories', 'geoId', 'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print("\n" + ERROR + " Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        os.system('mkdir -p %s' % os.path.abspath(TMP_PATH))
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print("\nSaved CSV file to be standardized at %s.\nRun it through the OWID standardizer and save in %s" % (
            colored(os.path.abspath(csv_path), 'magenta'),
            colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')
        ))
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])

    # Check for duplicate rows
    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print("\n" + ERROR + " Found duplicate rows:")
        print(df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print("\nPlease " + colored("fix or remove the duplicate rows", 'magenta') +
              " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx")
        print("Also please " + colored("note down any changes you made", 'magenta') +
              " in %s" % os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1

    # Check for missing population figures
    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()

    return errors == 0
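
# The version above relies on a `_load_merged` helper that is not shown. A minimal
# sketch of what it presumably does; the file format, join key, and the INPUT_PATH /
# LOCATIONS_CSV_PATH constants are assumptions for illustration, not the project's
# actual implementation:
import os
import pandas as pd

def _load_merged(filename):
    # Read the raw ECDC release and left-join the OWID location mapping, so that
    # countries without a standardized name end up with NaN in 'location'.
    df_data = pd.read_excel(os.path.join(INPUT_PATH, filename))  # INPUT_PATH: assumed module constant
    df_locations = pd.read_csv(LOCATIONS_CSV_PATH)  # assumed columns: 'countriesAndTerritories', 'location'
    return df_data.merge(df_locations, how='left', on='countriesAndTerritories')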
def check_data_correctness(df_merged):
    errors = 0

    # Check that every country name is standardized
    df_uniq = df_merged[["Country/Region", "location"]].drop_duplicates()
    if df_uniq["location"].isnull().any():
        print_err("\n" + ERROR + " Could not find OWID names for:")
        print_err(df_uniq[df_uniq["location"].isnull()])
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=["location"])

    # Check for duplicate rows
    if df_merged.duplicated(subset=["date", "location"]).any():
        print_err("\n" + ERROR + " Found duplicate rows:")
        print_err(df_merged[df_merged.duplicated(subset=["date", "location"])])
        errors += 1

    # Check for missing population figures
    df_pop = load_population()
    pop_entity_diff = set(df_uniq["location"]) - set(df_pop["location"]) - {"International"}
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()
        formatted_msg = ", ".join([f"`{entity}`" for entity in pop_entity_diff])
        send_warning(
            channel="corona-data-updates",
            title="Some entities are missing from the population dataset",
            message=formatted_msg
        )

    return errors == 0
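
# `send_warning` and `print_err` are project helpers whose definitions are not shown.
# A plausible sketch of `send_warning`, assuming it posts to a Slack incoming webhook;
# the environment variable name and payload shape are guesses, not the project's code:
import json
import os
import urllib.request

def send_warning(channel, title, message):
    # Post a single formatted alert to the webhook configured in the environment.
    payload = {"channel": channel, "text": f"*{title}*\n{message}"}
    req = urllib.request.Request(
        os.environ["SLACK_WEBHOOK_URL"],  # assumed configuration
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    urllib.request.urlopen(req)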
def check_data_correctness():
    errors = 0

    df_merged = _load_merged()
    df_uniq = df_merged[['Countries and territories', 'GeoId', 'location']].drop_duplicates()
    df_pop = load_population()

    # Check that every country name is standardized
    if df_uniq['location'].isnull().any():
        print("Error: Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc_locations.csv')
        print(os.path.abspath(TMP_PATH))
        os.system('mkdir -p %s' % os.path.abspath(TMP_PATH))
        df_uniq \
            .sort_values(by=['location']) \
            .to_csv(csv_path, index=False)
        print("Saved CSV file to be standardized at %s" % csv_path)
        errors += 1

    # Check for duplicate rows
    if df_merged.duplicated(subset=['DateRep', 'location']).any():
        print("Found duplicate rows:")
        print(df_merged[df_merged.duplicated(subset=['DateRep', 'location'])])
        errors += 1

    # Check for missing population figures
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("These entities were not found in the population dataset:")
        print(pop_entity_diff)

    return errors == 0
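
# `load_population` is shared by all of these versions but not shown. A minimal
# sketch, assuming a CSV keyed by OWID location names; the path constant and the
# column names are assumptions:
import pandas as pd

def load_population():
    # Expected to yield at least a 'location' column matching OWID entity names.
    return pd.read_csv(POPULATION_CSV_PATH)  # POPULATION_CSV_PATH: assumed module constant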
def check_data_correctness(filename):
    errors = 0

    df_merged = _load_merged(filename)

    # Check that every country name is standardized
    df_uniq = df_merged[['countriesAndTerritories', 'geoId', 'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print_err("\n" + ERROR + " Could not find OWID names for:")
        print_err(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        os.system('mkdir -p %s' % os.path.abspath(TMP_PATH))
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print_err(
            "\nSaved CSV file to be standardized at %s.\nRun it through the OWID standardizer and save in %s" % (
                colored(os.path.abspath(csv_path), 'magenta'),
                colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')))
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])

    # Check for duplicate rows
    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print_err("\n" + ERROR + " Found duplicate rows:")
        print_err(df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print_err(
            "\nPlease " + colored("fix or remove the duplicate rows", 'magenta') +
            " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx")
        print_err(
            "Also please " + colored("note down any changes you made", 'magenta') +
            " in %s" % os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1

    # Check for missing population figures
    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location']) - {'International'}
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()
        formatted_msg = ", ".join([f"`{entity}`" for entity in pop_entity_diff])
        send_warning(
            channel='corona-data-updates',
            title='Some entities are missing from the population dataset',
            message=formatted_msg)

    # Check for sudden changes
    sudden_changes_msg = ''
    for location, df_location in df_merged.groupby('location'):
        # Skip checks for "International"
        if location == 'International':
            continue
        for var_name in ['cases', 'deaths']:
            country_vals = df_location[var_name].dropna().values
            if len(country_vals) == 0:
                continue
            # Compare the latest 7-day average against the preceding 7-day average
            previous_RA = np.mean(country_vals[-8:-1])
            new_RA = np.mean(country_vals[-7:])
            if new_RA >= 1.5 * previous_RA and new_RA > 100:
                sudden_changes_msg += "<!> Sudden increase of *{}* in *{}*: {} (7-day average was {})\n".format(
                    var_name, location, int(country_vals[-1]), int(previous_RA))
            new_val = country_vals[-1]
            if new_val < 0:
                sudden_changes_msg += f"<!> Negative number of *{var_name}* in *{location}*: {new_val}\n"

    if sudden_changes_msg:
        print(sudden_changes_msg)
        formatted_msg = sudden_changes_msg.replace('<!>', ':warning:')
        send_warning(
            channel='corona-data-updates',
            title='Sudden changes in data',
            message=formatted_msg)

    return errors == 0
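
# The sudden-change heuristic above compares two overlapping 7-day windows. A small
# standalone illustration of the arithmetic, with made-up numbers; here the check
# fires because 156.4 >= 1.5 * 97.9 and 156.4 > 100:
import numpy as np

vals = np.array([90, 95, 100, 98, 102, 99, 101, 500])  # the newest value jumps
previous_ra = np.mean(vals[-8:-1])  # mean of the 7 values before the newest: ~97.9
new_ra = np.mean(vals[-7:])         # mean of the newest 7 values: ~156.4
print(new_ra >= 1.5 * previous_ra and new_ra > 100)  # True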