def test_file_index_creation():
    """
    The file index for an existing parameter combination must be non-empty,
    reproducible after a cache reset, contain the expected file for station
    1048, and raise an HTTPError for a combination the server does not offer.
    """
    reset_file_index_cache()

    # Existing combination of parameters
    first_index = create_file_index_for_dwd_server(
        Parameter.CLIMATE_SUMMARY, TimeResolution.DAILY, PeriodType.RECENT
    )
    assert not first_index.empty

    # Rebuilding after a cache reset must yield the identical index.
    reset_file_index_cache()
    rebuilt_index = create_file_index_for_dwd_server(
        Parameter.CLIMATE_SUMMARY, TimeResolution.DAILY, PeriodType.RECENT
    )
    assert first_index.equals(rebuilt_index)

    station_files = first_index.loc[
        first_index[DWDMetaColumns.STATION_ID.value] == 1048,
        DWDMetaColumns.FILENAME.value,
    ].values.tolist()
    assert station_files == ["daily/kl/recent/tageswerte_KL_01048_akt.zip"]

    # A parameter combination the server does not serve must fail loudly.
    with pytest.raises(requests.exceptions.HTTPError):
        create_file_index_for_dwd_server(
            Parameter.CLIMATE_SUMMARY, TimeResolution.MINUTE_1, PeriodType.HISTORICAL
        )
def create_file_list_for_dwd_server(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_file_index: bool = False,
) -> List[str]:
    """
    Function for selecting datafiles (links to archives) for given
    station_ids, parameter, time_resolution and period_type under
    consideration of a created list of files that are available online.

    Args:
        station_ids: ids for the weather station to ask for data
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_file_index: set if new file index is created

    Returns:
        List of path's to file
    """
    if create_new_file_index:
        reset_file_index_cache()

    # Consistency fix: resolve string inputs through the shared template
    # parser (as metadata_for_climate_observations and
    # collect_climate_observations_data do) instead of bare enum calls,
    # so invalid strings produce the same error handling everywhere.
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    file_index = create_file_index_for_dwd_server(
        parameter, time_resolution, period_type
    )

    # Keep only rows belonging to the requested stations.
    file_index = file_index[
        file_index[DWDMetaColumns.STATION_ID.value].isin(station_ids)
    ]

    return file_index[DWDMetaColumns.FILENAME.value].values.tolist()
def metadata_for_climate_observations(
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_meta_index: bool = False,
    create_new_file_index: bool = False,
) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that
    creates a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the pandas.DataFrame. For this purpose we
    use daily precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_meta_index: if true: a new meta index for metadata will
            be created
        create_new_file_index: if true: a new file index for metadata will
            be created

    Returns:
        pandas.DataFrame with metadata for selected parameters
    """
    if create_new_meta_index:
        reset_meta_index_cache()
    if create_new_file_index:
        reset_file_index_cache()

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    meta_index = create_meta_index_for_climate_observations(
        parameter, time_resolution, period_type
    )
    file_index = create_file_index_for_climate_observations(
        parameter, time_resolution, period_type
    )

    # A station "has a file" when its id also appears in the file index.
    meta_index[DWDMetaColumns.HAS_FILE.value] = meta_index[
        DWDMetaColumns.STATION_ID.value
    ].isin(file_index[DWDMetaColumns.STATION_ID.value])

    return meta_index
def collect_data(self) -> Generator[pd.DataFrame, None, None]:
    """
    Method to collect data for a defined request.

    Built as a generator in order to not cloak the memory; if the data is
    wanted as one pandas DataFrame, cast the generator manually via
    pd.concat(list(request.collect_data())).

    Returns:
        via a generator per station a pandas.DataFrame
    """
    if self.create_new_file_index:
        reset_file_index_cache()

    for station_id in self.station_ids:
        df_station = pd.DataFrame()

        for parameter in self.parameter:
            df_parameter_period = pd.DataFrame()

            for period_type in self.period_type:
                try:
                    df_period = collect_dwd_data(
                        station_ids=[station_id],
                        parameter=parameter,
                        time_resolution=self.time_resolution,
                        period_type=period_type,
                        folder=self.folder,
                        prefer_local=self.prefer_local,
                        write_file=self.write_file,
                        tidy_data=self.tidy_data,
                        humanize_column_names=self.humanize_column_names,
                        create_new_file_index=False,
                    )
                except InvalidParameterCombination:
                    # Fix: log the enum's value like the other two fields
                    # instead of the raw enum repr ({period_type}).
                    log.info(
                        f"Combination for "
                        f"{parameter.value}/"
                        f"{self.time_resolution.value}/"
                        f"{period_type.value} does not exist and is skipped."
                    )
                    continue

                # Filter out values which already are in the DataFrame
                # (historical/recent periods can overlap on DATE).
                try:
                    df_period = df_period[
                        ~df_period[DWDMetaColumns.DATE.value].isin(
                            df_parameter_period[DWDMetaColumns.DATE.value]
                        )
                    ]
                except KeyError:
                    # One of the frames has no DATE column yet (empty) —
                    # nothing to de-duplicate against.
                    pass

                df_parameter_period = df_parameter_period.append(
                    df_period, ignore_index=True
                )

            df_station = df_station.append(df_parameter_period, ignore_index=True)

        # Filter for dates range if start_date and end_date are defined.
        # NOTE(review): assumes end_date is always set whenever start_date
        # is — confirm the initializer guarantees this pairing.
        if self.start_date:
            df_station = df_station[
                (df_station[DWDMetaColumns.DATE.value] >= self.start_date)
                & (df_station[DWDMetaColumns.DATE.value] <= self.end_date)
            ]

        # Empty dataframe should be skipped
        if df_station.empty:
            continue

        yield df_station
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Organize the complete pipeline of data collection, either from the
    internet or from a local file.

    Goes through every given station id and, depending on the parameters,
    either tries to get data from the local store and/or, if that fails,
    from the internet. Finally, if wanted, stores the data in an hdf file.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration
        time_resolution: time resolution as enumeration
        period_type: period type as enumeration
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of
            values for a datetime in a row, e.g.
            station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
            human consumption
        run_download_only: boolean to run only the download and storing
            process
        create_new_file_index: boolean if to create a new file index for
            the data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    # One collected pandas DataFrame per station id.
    collected_frames = []

    for station_id in set(station_ids):
        request_string = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            # Try restoring data from the local store first.
            station_data = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )
            # When successful, append data and continue with next station.
            if not station_data.empty:
                log.info(f"Data for {request_string} restored from local.")
                collected_frames.append(station_data)
                continue

        log.info(f"Data for {request_string} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )
        if not remote_files:
            log.info(f"No files found for {request_string}. Station will be skipped.")
            continue

        filenames_and_files = download_climate_observations_data_parallel(remote_files)
        station_data = parse_climate_observations_data(
            filenames_and_files, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                station_data,
                station_id,
                parameter,
                time_resolution,
                period_type,
                folder,
            )

        collected_frames.append(station_data)

    if run_download_only:
        return None

    try:
        data = pd.concat(collected_frames)
    except ValueError:
        # Nothing was collected for any station — return an empty frame.
        return pd.DataFrame()

    if tidy_data:
        data = _tidy_up_data(data, parameter)

    # Assign meaningful column names (humanized).
    if humanize_column_names:
        hcnm = create_humanized_column_names_mapping(time_resolution, parameter)
        if tidy_data:
            data[DWDMetaColumns.ELEMENT.value] = data[
                DWDMetaColumns.ELEMENT.value
            ].apply(lambda x: hcnm[x])
        else:
            data = data.rename(columns=hcnm)

    return data