def main_get_data(
    paths,
    parallel: bool = False,
    n_jobs: int = -2,
    modules_name: list = modules_name,
    skip_countries: list = [],
    gsheets_api=None,
):
    """Run every data-getter module once, then retry the ones that failed.

    Is equivalent to script `run_python_scripts.py`.

    Args:
        paths: Forwarded unchanged to `CountryDataGetter`.
        parallel: If True, the first pass is dispatched through joblib's
            `Parallel` with the threading backend; otherwise modules run one
            after another. The retry pass is always sequential.
        n_jobs: Forwarded to `Parallel`; only used when `parallel` is True.
        modules_name: Names of the modules to run.
        skip_countries: Country names, lower-cased before being handed to
            `CountryDataGetter`.
        gsheets_api: Forwarded unchanged to `CountryDataGetter`.

    Returns:
        None. Modules that still fail after the retry are listed on stdout.
    """
    print("-- Getting data... --")

    def _failed_names(results):
        # A run counts as failed only when it reports `success` as exactly False.
        return [res["module_name"] for res in results if res["success"] is False]

    lowered_skips = [name.lower() for name in skip_countries]
    getter = CountryDataGetter(paths, lowered_skips, gsheets_api)

    # First pass over every requested module.
    if not parallel:
        first_pass = [getter.run(name) for name in modules_name]
    else:
        pool = Parallel(n_jobs=n_jobs, backend="threading")
        first_pass = pool(delayed(getter.run)(name) for name in modules_name)
    modules_failed = _failed_names(first_pass)

    # Second pass: give each failed module one more attempt, sequentially.
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    second_pass = [getter.run(name) for name in modules_failed]
    modules_failed_retrial = _failed_names(second_pass)

    if modules_failed_retrial:
        failed_str = "\n".join(f"* {m}" for m in modules_failed_retrial)
        print(f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}")
    print_eoe()
def main_process_data(
    paths,
    gsheets_api,
    google_spreadsheet_vax_id: str,
    skip_complete: list = None,
    skip_monotonic: dict = {},
    skip_anomaly: dict = {},
):
    """Process manual and automated vaccination data and export the results.

    Data frames are gathered from the Google Spreadsheet (`gsheet.df_list()`)
    and from the per-country files of the countries listed in
    `gsheet.automated_countries`. Each frame whose location is not skipped is
    passed through `process_location` and written to its own file; the
    processed frames are then concatenated, sorted by location and date, and
    written to `paths.tmp_vax_all`. The spreadsheet metadata is written to
    `paths.tmp_met_all`.

    Args:
        paths: Object providing `tmp_vax_out(country)`, `pub_vax_loc(country)`,
            `tmp_vax_all` and `tmp_met_all`.
        gsheets_api: Forwarded unchanged to `VaccinationGSheet`.
        google_spreadsheet_vax_id: Forwarded unchanged to `VaccinationGSheet`.
        skip_complete: Location names to leave out entirely. Entries are
            compared against the lower-cased location, so they are expected
            in lower case. `None` (the default) skips nothing.
        skip_monotonic: Maps a location name to the value handed to
            `process_location` as its second argument (`[]` when absent).
        skip_anomaly: Maps a location name to the value handed to
            `process_location` as its third argument (`[]` when absent).

    Raises:
        ValueError: If any data frame lacks a `location` column.
    """
    # The default is None; without this normalisation the membership test
    # below raises `TypeError: argument of type 'NoneType' is not iterable`.
    if skip_complete is None:
        skip_complete = []

    print("-- Processing data... --")
    # Get data from sheets
    logger.info("Getting data from Google Spreadsheet...")
    gsheet = VaccinationGSheet(gsheets_api, google_spreadsheet_vax_id)
    df_manual_list = gsheet.df_list()
    # Get automated-country data
    logger.info("Getting data from output...")
    automated = gsheet.automated_countries
    filepaths_auto = [paths.tmp_vax_out(country) for country in automated]
    df_auto_list = [pd.read_csv(filepath) for filepath in filepaths_auto]
    # Concatenate
    vax = df_manual_list + df_auto_list

    # Process locations
    def _process_location(df):
        # The location name is read from the row labelled 0.
        location = df.loc[0, "location"]
        monotonic_check_skip = skip_monotonic.get(location, [])
        anomaly_check_skip = skip_anomaly.get(location, [])
        return process_location(df, monotonic_check_skip, anomaly_check_skip)

    logger.info("Processing and exporting data...")
    vax_valid = []
    for df in vax:
        if "location" not in df:
            raise ValueError(f"Column `location` missing. df: {df.tail(5)}")
        country = df.loc[0, "location"]
        if country.lower() in skip_complete:
            logger.info(f"{country}: SKIPPED 🚧")
            continue
        df = _process_location(df)
        vax_valid.append(df)
        # Export
        df.to_csv(paths.pub_vax_loc(country), index=False)
        logger.info(f"{country}: SUCCESS ✅")

    df = pd.concat(vax_valid).sort_values(by=["location", "date"])
    df.to_csv(paths.tmp_vax_all, index=False)
    gsheet.metadata.to_csv(paths.tmp_met_all, index=False)
    logger.info("Exported ✅")
    print_eoe()
def main_propose_data_twitter(
    paths,
    consumer_key: str,
    consumer_secret: str,
    parallel: bool = False,
    n_jobs: int = -2,
):
    """Generate data proposals from Twitter for every module, retrying failures once.

    Args:
        paths: Forwarded unchanged to `_propose_data_country`.
        consumer_key: Forwarded to `TwitterAPI`.
        consumer_secret: Forwarded to `TwitterAPI`.
        parallel: If True, the first pass is dispatched through joblib's
            `Parallel` with the threading backend; otherwise modules run one
            after another. The retry pass is always sequential.
        n_jobs: Forwarded to `Parallel`; only used when `parallel` is True.

    Returns:
        None. Modules that still fail after the retry are listed on stdout.
    """
    print("-- Generating data proposals from Twitter sources... --")
    api = TwitterAPI(consumer_key, consumer_secret)

    def _failed_names(results):
        # A run counts as failed only when it reports `success` as exactly False.
        return [res["module_name"] for res in results if res["success"] is False]

    # First pass over every module listed in the module-level `modules_name`.
    if not parallel:
        first_pass = [
            _propose_data_country(api, name, paths) for name in modules_name
        ]
    else:
        pool = Parallel(n_jobs=n_jobs, backend="threading")
        first_pass = pool(
            delayed(_propose_data_country)(api, name, paths)
            for name in modules_name
        )
    modules_failed = _failed_names(first_pass)

    # Second pass: one more sequential attempt for each failed module.
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    second_pass = [
        _propose_data_country(api, name, paths) for name in modules_failed
    ]
    modules_failed_retrial = _failed_names(second_pass)

    if modules_failed_retrial:
        failed_str = "\n".join(f"* {m}" for m in modules_failed_retrial)
        print(
            f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}"
        )
    print_eoe()
def main_process_data(
    paths,
    google_credentials: str,
    google_spreadsheet_vax_id: str,
    skip_complete: list = None,
    skip_monotonic: dict = {},
    skip_anomaly: dict = {},
):
    """Process manual and automated vaccination data and export the results.

    Data frames are gathered from the Google Spreadsheet (`gsheet.df_list()`)
    and from the per-country files of the countries listed in
    `gsheet.automated_countries`. Frames whose location is not skipped are
    passed through `process_location`, written to one file per location,
    concatenated, sorted by location and date, and written to
    `paths.tmp_vax_all`. The spreadsheet metadata is written to
    `paths.tmp_met_all`.

    Args:
        paths: Object providing `tmp_vax_out(country)`, `pub_vax_loc(country)`,
            `tmp_vax_all` and `tmp_met_all`.
        google_credentials: Forwarded unchanged to `GSheet`.
        google_spreadsheet_vax_id: Forwarded unchanged to `GSheet`.
        skip_complete: Location names to leave out entirely. Entries are
            compared against the lower-cased location, so they are expected
            in lower case. `None` (the default) skips nothing.
        skip_monotonic: Maps a location name to the value handed to
            `process_location` as its second argument (`[]` when absent).
        skip_anomaly: Maps a location name to the value handed to
            `process_location` as its third argument (`[]` when absent).
    """
    # The default is None; without this normalisation the membership test in
    # the filter below raises `TypeError: argument of type 'NoneType' is not
    # iterable`.
    if skip_complete is None:
        skip_complete = []

    print("-- Processing data... --")
    # Get data from sheets
    logger.info("Getting data from Google Spreadsheet...")
    gsheet = GSheet(google_credentials, google_spreadsheet_vax_id)
    df_manual_list = gsheet.df_list()
    # Get automated-country data
    logger.info("Getting data from output...")
    automated = gsheet.automated_countries
    filepaths_auto = [paths.tmp_vax_out(country) for country in automated]
    df_auto_list = [pd.read_csv(filepath) for filepath in filepaths_auto]
    # Concatenate
    vax = df_manual_list + df_auto_list

    # Process locations
    def _process_location(df):
        # The location name is read from the row labelled 0.
        location = df.loc[0, "location"]
        monotonic_check_skip = skip_monotonic.get(location, [])
        anomaly_check_skip = skip_anomaly.get(location, [])
        return process_location(df, monotonic_check_skip, anomaly_check_skip)

    logger.info("Processing and exporting data...")
    vax = [
        _process_location(df)
        for df in vax
        if df.loc[0, "location"].lower() not in skip_complete
    ]
    # Export one file per processed location
    for df in vax:
        country = df.loc[0, "location"]
        df.to_csv(paths.pub_vax_loc(country), index=False)
    df = pd.concat(vax).sort_values(by=["location", "date"])
    df.to_csv(paths.tmp_vax_all, index=False)
    gsheet.metadata.to_csv(paths.tmp_met_all, index=False)
    logger.info("Exported ✅")
    print_eoe()
def main_get_data(
    paths,
    parallel: bool = False,
    n_jobs: int = -2,
    modules_name: list = modules_name,
    greece_api_token: str = None,
):
    """Run every data-getter module once, then retry the ones that failed.

    Is equivalent to script `run_python_scripts.py`.

    Args:
        paths: Forwarded unchanged to `_get_data_country`.
        parallel: If True, the first pass is dispatched through joblib's
            `Parallel` with the threading backend; otherwise modules run one
            after another. The retry pass is always sequential.
        n_jobs: Forwarded to `Parallel`; only used when `parallel` is True.
        modules_name: Names of the modules to run.
        greece_api_token: Forwarded unchanged to `_get_data_country`.

    Returns:
        None. Modules that still fail after the retry are reported through
        `logger.warning`.
    """
    print("-- Getting data... --")
    if parallel:
        modules_execution_results = Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(_get_data_country)(module_name, paths, greece_api_token)
            for module_name in modules_name
        )
    else:
        modules_execution_results = [
            _get_data_country(module_name, paths, greece_api_token)
            for module_name in modules_name
        ]
    modules_failed = [
        m["module_name"] for m in modules_execution_results if m["success"] is False
    ]

    # Retry failed modules. The retry results must be kept: previously they
    # were discarded, so the failure report below could never be emitted.
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    retrial_results = [
        _get_data_country(module_name, paths, greece_api_token)
        for module_name in modules_failed
    ]
    modules_failed_retrial = [
        m["module_name"] for m in retrial_results if m["success"] is False
    ]

    if modules_failed_retrial:
        failed_str = "\n".join(f"* {m}" for m in modules_failed_retrial)
        # `warning` replaces the deprecated `warn` alias; the added "\n" puts
        # the first bullet on its own line.
        logger.warning(
            f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}"
        )
    print_eoe()