def get_data_items(cls, data_items='all', regions='ES', start_date=None,
                   end_date=None, language='ES', errors='ignore'):
    """
    Collects the required data items from the associated data sources.

    Parameters
    ----------
    data_items : list of str
        List of data item names. By default, 'all' are collected.
    regions : list of str
        List of region names. By default, 'ES' refers to all Spanish regions.
    start_date : pd.datetime
        First day to be considered in TEMPORAL data items. Defaults to None.
    end_date : pd.datetime
        Last day to be considered in TEMPORAL data items. Defaults to None.
    language : str
        Language of the returned data: 'ES' for Spanish (default),
        'EN' for English.
    errors : str
        Action to take when an error occurs: 'ignore' tries to get all
        possible data items even if some cannot be collected, 'raise'
        throws an exception and aborts execution upon detection of any error.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the required information.

    Notes
    -----
    If dates are passed, a TEMPORAL retrieval is assumed; otherwise, a
    GEOGRAPHICAL retrieval is assumed.

    A TEMPORAL retrieval produces a DataFrame with daily [Date] as the row
    index and [Region, Data Item] as the column multi-index.

    A GEOGRAPHICAL retrieval produces a DataFrame with [Region] as the row
    index and [Data Item] as the column index.
    """

    # if data sources are not initialized, read the configurations
    if not cls.__DATA_SOURCES_INITIALIZED:
        cls.__init_data_sources()

    ##### parameter checks #####

    if data_items == 'all':
        data_items = cls.get_data_items_names(data_type=None, language=language)
    else:
        ## check that the requested items are implemented ##

        # get all implemented items
        implemented_data_sources = cls.get_data_items_names(data_type=None,
                                                            language=language)
        implemented_data_items = []
        for implemented_data_source in implemented_data_sources.keys():
            implemented_data_items += implemented_data_sources[implemented_data_source]

        # keep only the implemented items
        successful_data_items = []
        for data_item in data_items:
            if data_item not in implemented_data_items:
                print(f'WARNING: Item {data_item} is not implemented')
            else:
                successful_data_items.append(data_item)

        if not successful_data_items:
            print('WARNING: No result found for the specified data items and conditions')
            return None
        data_items = successful_data_items

    if regions == 'ES':
        regions = Regions.get_regions('ES')

    if start_date is None or end_date is None:
        assumed_data_type = DataType.GEOGRAPHICAL
    else:
        assumed_data_type = DataType.TEMPORAL

    print("Assumed a " + str(assumed_data_type.name) + " data retrieval...")

    if assumed_data_type is DataType.TEMPORAL:
        if start_date > end_date:
            print('ERROR: start_date (' + str(start_date) +
                  ') should be earlier than or equal to end_date (' +
                  str(end_date) + ')')
            return None
        if end_date > pd.to_datetime('today').date():
            print('ERROR: end_date (' + str(end_date) +
                  ') should not refer to the future')
            return None

    ### change data items (display names) to the internal representation ###

    # get the internal name -> display name dict (used later to rename back)
    internalname_displayname_dict = cls._get_internal_names_mapping(
        assumed_data_type, data_items, language=language)
    if internalname_displayname_dict is None:
        return None
    # change data_items to the internal representation
    data_items = list(internalname_displayname_dict.keys())

    ### group data items by data source in a dictionary ###

    # existing items for the assumed data type
    items_by_source = cls.get_data_items_names(
        data_type=assumed_data_type,
        language=language)  # dict: source -> [item1, item2, ...]
    items_by_assumed_data_type = []
    for items in items_by_source.values():
        items_by_assumed_data_type += items

    # group the requested items by data source
    requested_items_by_source = defaultdict(
        list)  # dict: data source -> [requested item 1, requested item 2, ...]
    for data_item in data_items:
        source_class_found = False
        source = 0
        while source < len(cls.__DATA_SOURCE_CLASSES) and not source_class_found:
            source_class_found = cls.__DATA_SOURCE_CLASSES[source].data_item_exists(data_item)
            source += 1
        if source_class_found:
            requested_items_by_source[cls.__DATA_SOURCE_CLASSES[source - 1]].append(data_item)
        else:
            # should never get here
            print('WARNING: Data source not found for item \'' + str(data_item) + '\'')

    ##### data retrieval #####

    df_all_data_sources = None

    ## get data, source by source ##
    for DATA_SOURCE_CLASS in requested_items_by_source.keys():
        df_data_source = None
        data_items = requested_items_by_source[DATA_SOURCE_CLASS]

        # temporal data type
        if assumed_data_type is DataType.TEMPORAL:
            df_data_source = DATA_SOURCE_CLASS(data_items, regions, start_date,
                                               end_date).get_data(errors)
            if df_data_source is not None:
                # complete with NaN values those days without info
                df_data_source = cls.__complete_dates(df_data_source,
                                                      start_date, end_date)
        # geographical data type
        elif assumed_data_type is DataType.GEOGRAPHICAL:
            df_data_source = DATA_SOURCE_CLASS(data_items, regions).get_data(errors)
        else:
            # should never get here
            return None

        # incrementally join the data coming from the diverse data sources
        if df_data_source is not None:
            if df_all_data_sources is None:
                df_all_data_sources = df_data_source.sort_index(axis=1)
            else:
                df_all_data_sources = pd.concat(
                    [df_all_data_sources, df_data_source],
                    axis='columns').sort_index(axis=1)
    ## END: get data by data source ##

    if df_all_data_sources is None:
        print('WARNING: No result found for the specified data items and conditions')
        return None

    # rename internal names back to display names
    def rename_with_regex(col_name):
        for internal_name in internalname_displayname_dict.keys():
            if re.match(f"^{internal_name}$|^{internal_name} \\(", col_name):
                return re.sub(pattern=internal_name,
                              repl=internalname_displayname_dict[internal_name],
                              string=col_name)
        return 'None'

    df_all_data_sources.rename(columns=rename_with_regex, level='Item',
                               inplace=True)

    ### filter the retrieved data to match the query determined by data_items, regions and dates ###

    # filter requested data items (some data sources retrieve more data items
    # in the same query than the ones requested)
    df_all_data_sources = df_all_data_sources.loc[
        :, df_all_data_sources.columns.get_level_values('Item') != 'None']

    if assumed_data_type is DataType.TEMPORAL:
        # filter requested dates (indexes)
        df_all_data_sources = df_all_data_sources[
            (df_all_data_sources.index >= start_date) &
            (df_all_data_sources.index <= end_date)]
        # filter requested regions (column level)
        df_all_data_sources = df_all_data_sources.loc[
            :, df_all_data_sources.columns.get_level_values('Region').isin(regions)]
    else:
        # filter requested regions (indexes)
        df_all_data_sources = df_all_data_sources[
            df_all_data_sources.index.isin(regions)]

    # drop duplicated columns
    df_all_data_sources = df_all_data_sources.loc[
        :, ~df_all_data_sources.columns.duplicated()]

    return df_all_data_sources
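# A usage sketch of get_data_items (illustrative only: the item and region
# names below are hypothetical and depend on the configured data sources):
#
#     import pandas as pd
#
#     # TEMPORAL retrieval (dates given): rows indexed by [Date],
#     # columns multi-indexed by [Region, Data Item]
#     temporal_df = COnVIDa.get_data_items(
#         data_items=['Casos'],                      # hypothetical item name
#         regions=['Murcia', 'Madrid'],
#         start_date=pd.to_datetime('2020-03-01'),
#         end_date=pd.to_datetime('2020-04-01'))
#
#     # GEOGRAPHICAL retrieval (no dates): rows indexed by [Region],
#     # columns indexed by [Data Item]
#     geo_df = COnVIDa.get_data_items(data_items=['Casos'], regions=['Murcia'])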
def daily_update(cls) -> bool:
    """
    Checks which data sources should be refreshed and accordingly updates
    the Data Cache (which is loaded in memory) FROM the last cached day
    minus the number of days indicated in the class attribute __UPDATE_DAYS
    UNTIL today. This method removes the outdated file and creates the
    up-to-date file in the data path (class attribute __DATA_PATH) with
    today's filename `cache_YYYY-MM-DD.h5`.

    Returns
    -------
    bool
        True if the update was done, False otherwise.

    Notes
    -----
    * If this method notices that the cache filename corresponds to today's
      date, it assumes that the Data Cache is up-to-date and nothing more
      is performed.
    * This method updates the Data Cache on disk, but load_data() should be
      executed afterwards to perform the update in memory and, in turn,
      enable up-to-date queries.
    """

    # date of today
    today = pd.to_datetime(pd.to_datetime('today').strftime(format='%Y-%m-%d'))

    # check if the daily update has already been done
    try:
        for file in os.listdir(cls.__DATA_PATH):
            if re.match(rf"cache_{str(today)[0:10]}\.h5", file):
                cls.__LOGGER.info(
                    f"Daily update avoided, the cache is up-to-date "
                    f"(today's file cache_{str(today)[0:10]}.h5 already exists)")
                return True
    except Exception as e:
        cls.__LOGGER.exception(f"ERROR finding cache file: {e}")
        return False

    # check which data sources should be updated
    datasources_to_update = []
    dsi = COnVIDa._get_update_frequencies()
    for ds in dsi.keys():
        if cls.__LAST_UPDATE_TIMESTAMPS is None:
            datasources_to_update.append(ds)
        elif cls.__LAST_UPDATE_TIMESTAMPS.loc[ds, 'last_update'] is None:
            datasources_to_update.append(ds)
        else:
            days_without_updating = (
                today - cls.__LAST_UPDATE_TIMESTAMPS.loc[ds, 'last_update']).days
            if days_without_updating >= dsi[ds]:
                datasources_to_update.append(ds)

    if not datasources_to_update:
        cls.__LOGGER.info("No source is out of date")
        return True

    # all regions
    all_regions = Regions.get_regions('ES')

    # new cache file
    new_cache_file = os.path.join(cls.__DATA_PATH,
                                  "cache_{}.h5".format(str(today)[0:10]))
    # last cache file
    last_cache_file = cls.__CACHE_PATH

    ####### GEOGRAPHICAL UPDATE #######

    # all geographical data items of the outdated sources
    try:
        datasources = COnVIDa.get_data_items_names(DataType.GEOGRAPHICAL,
                                                   language='internal')
        all_data_items = []
        for datasource in datasources.keys():
            if datasource in datasources_to_update:
                for data_item in datasources[datasource]:
                    all_data_items.append(data_item)

        if not all_data_items:
            new_geodata = None
        else:
            new_geodata = COnVIDa.get_data_items(regions=all_regions,
                                                 data_items=all_data_items,
                                                 language='internal',
                                                 errors='raise')
    except Exception as e:
        cls.__LOGGER.exception(
            f"Retrieval of geographical data in daily update failed: {e}")
        return False

    ####### TEMPORAL UPDATE #######

    # all temporal data items of the outdated sources
    try:
        datasources = COnVIDa.get_data_items_names(DataType.TEMPORAL,
                                                   language='internal')
        all_data_items = []
        for datasource in datasources.keys():
            if datasource in datasources_to_update:
                for data_item in datasources[datasource]:
                    all_data_items.append(data_item)

        if not all_data_items:
            new_tempdata = None
        else:
            last_date = cls.__DATA[DataType.TEMPORAL].index[-1]
            start_date = last_date - pd.DateOffset(days=cls.__UPDATE_DAYS)

            # get updated data of the last days, up to and including today
            new_data = COnVIDa.get_data_items(regions=all_regions,
                                              data_items=all_data_items,
                                              start_date=start_date,
                                              end_date=today,
                                              language='internal',
                                              errors='raise')

            # update the cache, keeping the most recent version of duplicated days
            new_tempdata = pd.concat([cls.__DATA[DataType.TEMPORAL], new_data])
            new_tempdata = new_tempdata.loc[~new_tempdata.index.duplicated(keep='last')]
    except Exception as e:
        cls.__LOGGER.exception(
            f"Retrieval of temporal data in daily update failed: {e}")
        return False

    ####### COMPLETE UPDATE IF NEW DATA IS AVAILABLE #######

    try:
        if new_geodata is not None:
            new_geodata.to_hdf(path_or_buf=new_cache_file,
                               key='geographical', mode='a')
        else:
            cls.__DATA[DataType.GEOGRAPHICAL].to_hdf(path_or_buf=new_cache_file,
                                                     key='geographical', mode='a')

        if new_tempdata is not None:
            new_tempdata.to_hdf(path_or_buf=new_cache_file,
                                key='temporal', mode='a')
        else:
            cls.__DATA[DataType.TEMPORAL].to_hdf(path_or_buf=new_cache_file,
                                                 key='temporal', mode='a')

        if cls.__LAST_UPDATE_TIMESTAMPS is None:
            # no previous timestamps: every data source was just updated, so
            # build the timestamp table from scratch (the original code called
            # .loc on None here, which would have failed)
            cls.__LAST_UPDATE_TIMESTAMPS = pd.DataFrame(
                {'last_update': today}, index=datasources_to_update)
        else:
            cls.__LAST_UPDATE_TIMESTAMPS.loc[
                cls.__LAST_UPDATE_TIMESTAMPS.index.isin(datasources_to_update),
                "last_update"] = today

        cls.__LAST_UPDATE_TIMESTAMPS.to_hdf(path_or_buf=new_cache_file,
                                            key='last_updates', mode='a')
    except Exception as e:
        # remove the just-created cache file if the daily update failed
        if os.path.exists(new_cache_file):
            os.remove(new_cache_file)
        cls.__LOGGER.exception(
            f"Creation of new cache file in daily update failed: {e}")
        return False

    # if the process has completed successfully, remove the old cache;
    # at this point both caches exist on disk and the new one is in memory
    try:
        if os.path.exists(new_cache_file) and os.path.exists(last_cache_file):
            os.remove(last_cache_file)  # remove the outdated cache
        cls.__LOGGER.info("Daily update done!")
        return True
    except Exception as e:
        cls.__LOGGER.exception(f"Error in removing old cache file: {e}")

    ### should never get here ###
    try:
        # if removing the old cache file failed, recover the old status
        if os.path.exists(new_cache_file):
            os.remove(new_cache_file)
        cls.load_data(last_cache_file)
    except Exception:
        cls.__LOGGER.info(
            "Critical fail in daily update: it was not possible to recover old status")
    return False
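# A usage sketch of the update cycle (illustrative only: the path variable is
# hypothetical; load_data() taking a cache-file path follows the recovery
# branch above and the docstring note):
#
#     if COnVIDa.daily_update():
#         # daily_update() only refreshes the cache on disk; reload it so
#         # that subsequent queries are served from up-to-date data
#         COnVIDa.load_data(path_to_new_cache)   # hypothetical path variable
#     df = COnVIDa.get_data_items(data_items='all',
#                                 regions=Regions.get_regions('ES'))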