def create_file_list_for_dwd_server(statid: List[int],
                                    parameter: Parameter,
                                    time_resolution: TimeResolution,
                                    period_type: PeriodType,
                                    folder: str = MAIN_FOLDER,
                                    create_new_filelist=False) -> List[str]:
    """
    Select data files (links to archives) on the DWD server for the given
    station ids, parameter, time_resolution and period_type, based on a
    locally cached index of the files that are available online.

    Args:
        statid: ids of the weather stations to ask for data
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local folder holding the cached file index
        create_new_filelist: if True, rebuild the index even if one exists

    Returns:
        List of paths to files
    """
    # Validate argument types before doing any work.
    assert isinstance(statid, list)
    assert isinstance(parameter, Parameter)
    assert isinstance(time_resolution, TimeResolution)
    assert isinstance(period_type, PeriodType)
    assert isinstance(folder, str)
    assert isinstance(create_new_filelist, bool)

    # Reject unsupported parameter combinations early.
    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    folder = correct_folder_path(folder)

    # Assemble the local path of the cached file index.
    index_name = (f'{FILELIST_NAME}_{parameter.value}_'
                  f'{time_resolution.value}_{period_type.value}')
    index_path = f"{Path(folder, SUB_FOLDER_METADATA, index_name)}{DATA_FORMAT}"

    # Build a fresh index when requested or when none is cached yet.
    if create_new_filelist or not Path(index_path).is_file():
        create_fileindex(parameter=parameter,
                         time_resolution=time_resolution,
                         period_type=period_type,
                         folder=folder)

    file_index = pd.read_csv(index_path)

    station_mask = file_index[STATION_ID_NAME].isin(statid)
    return file_index.loc[station_mask, FILENAME_NAME].tolist()
def get_nearest_station(latitudes: Union[List[float], np.array],
                        longitudes: Union[List[float], np.array],
                        parameter: Union[Parameter, str],
                        time_resolution: Union[TimeResolution, str],
                        period_type: Union[PeriodType, str],
                        num_stations_nearby: int = 1) -> \
        Tuple[List[int], List[float]]:
    """
    Provides a list of weather station ids for the requested data

    Args:
        latitudes: latitudes of locations to search for nearest weather station
        longitudes: longitudes of locations to search for nearest weather station
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby

    Returns:
        list of stations ids for the given locations/coordinate pairs and
        a list of distances in kilometer to the weather station
    """
    parameter = Parameter(parameter)
    time_resolution = TimeResolution(time_resolution)
    period_type = PeriodType(period_type)

    if not isinstance(latitudes, list):
        latitudes = np.array(latitudes)
    if not isinstance(longitudes, list):
        # BUG FIX: this assignment previously overwrote 'latitudes' with the
        # longitude values, corrupting every coordinate pair.
        longitudes = np.array(longitudes)

    check_parameters(parameter, time_resolution, period_type)
    coords = Coordinates(latitudes, longitudes)

    metadata = metadata_for_dwd_data(parameter, time_resolution, period_type)

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    # The helper may return a 2D index array; keep only the first row so the
    # .loc lookup below receives a flat index.
    if np.max(indices_nearest_neighbours.shape) > 1:
        indices_nearest_neighbours = indices_nearest_neighbours[0]

    # NOTE(review): distances appear to be expressed in Earth radii, hence the
    # KM_EARTH_RADIUS scaling to kilometres — confirm in _derive_nearest_neighbours.
    return metadata.loc[indices_nearest_neighbours, 'STATION_ID'].tolist(), \
        (distances * KM_EARTH_RADIUS).tolist()
def get_nearest_station(latitudes: Union[List[float], np.array],
                        longitudes: Union[List[float], np.array],
                        parameter: Parameter,
                        time_resolution: TimeResolution,
                        period_type: PeriodType) -> \
        Tuple[List[int], List[float]]:
    """
    Provides a list of weather station ids for the requested data

    Args:
        latitudes: latitudes of locations to search for nearest weather station
        longitudes: longitudes of locations to search for nearest weather station
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files

    Returns:
        list of stations ids for the given locations/coordinate pairs and
        a list of distances in kilometer to the weather station
    """
    if not isinstance(latitudes, list):
        latitudes = np.array(latitudes)
    if not isinstance(longitudes, list):
        # BUG FIX: this assignment previously overwrote 'latitudes' with the
        # longitude values instead of converting 'longitudes'.
        longitudes = np.array(longitudes)

    check_parameters(parameter, time_resolution, period_type)
    coords = Coordinates(latitudes, longitudes)

    metadata = metadata_for_dwd_data(parameter, time_resolution, period_type)

    distances, indices_nearest_neighbours = derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords)

    # NOTE(review): distances appear to be expressed in Earth radii, hence the
    # KM_EARTH_RADIUS scaling to kilometres — confirm in derive_nearest_neighbours.
    return metadata.loc[indices_nearest_neighbours, 'STATION_ID'].tolist(), \
        (distances * KM_EARTH_RADIUS).tolist()
def __init__(self,
             station_id: Union[str, int, List[Union[int, str]]],
             parameter: Union[str, Parameter],
             time_resolution: Union[str, TimeResolution],
             period_type: Union[None, str, list, PeriodType] = None,
             start_date: Union[None, str, Timestamp] = None,
             end_date: Union[None, str, Timestamp] = None) -> None:
    """
    Normalise and validate a data request.

    Args:
        station_id: one id or a list of ids of the requested weather stations
        parameter: observation measure (enum or free-text word)
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files; may be omitted when a
            start_date/end_date range is given instead
        start_date: begin of the requested date range
        end_date: end of the requested date range

    Raises:
        ValueError: for unparseable station ids or when no valid
            parameter/resolution/period combination remains
        StartDateEndDateError: when start_date lies after end_date
    """
    if not (period_type or (start_date and end_date)):
        raise ValueError("Define either a 'time_resolution' or both the 'start_date' and 'end_date' and "
                         "leave the other one empty!")

    # BUG FIX: the previous check iterated the raw 'station_id' argument,
    # which raised a TypeError for a scalar id and rejected numeric strings
    # even though the type hint allows them. Normalise to a list first and
    # let int() decide what is parseable.
    try:
        self.station_id = [int(s) for s in cast_to_list(station_id)]
    except (ValueError, TypeError):
        raise ValueError("List of station id's contains none integer values or is at least not given as a list")

    self.parameter = parameter if isinstance(parameter, Parameter) \
        else _parse_parameter_from_value(parameter, PARAMETER_WORDLIST_MAPPING)

    self.time_resolution = time_resolution if isinstance(time_resolution, TimeResolution) \
        else _parse_parameter_from_value(time_resolution, TIMERESOLUTION_WORDLIST_MAPPING)

    self.period_type = cast_to_list(period_type) if isinstance(period_type, (PeriodType, type(None))) \
        else [_parse_parameter_from_value(period_type, PERIODTYPE_WORDLIST_MAPPING)
              for period_type in cast_to_list(period_type)]

    self.start_date = parse_date(start_date)
    self.end_date = parse_date(end_date)

    if self.start_date:
        # working with ranges of data means expecting data to be laying between periods, thus including all
        self.period_type = [PeriodType.HISTORICAL, PeriodType.RECENT, PeriodType.NOW]

        if not self.start_date <= self.end_date:
            raise StartDateEndDateError

    # Drop every period that is not served for this parameter/resolution.
    for period_type in self.period_type.copy():
        if not check_parameters(parameter=self.parameter,
                                time_resolution=self.time_resolution,
                                period_type=period_type):
            print(f"Combination of: parameter {self.parameter.value}, "
                  f"time_resolution {self.time_resolution.value}, "
                  f"period_type {period_type} not available and removed.")
            self.period_type.remove(period_type)

    # Use the clean up of self.period_type to identify if there's any data with those parameters
    if not self.period_type:
        raise ValueError("Error: no combination for parameter, time_resolution and period_type could be found.")
def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = MAIN_FOLDER,
                          write_file: bool = True,
                          create_new_filelist: bool = False):
    """
    A main function to retrieve metadata for a set of parameters that
    creates a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the dataframe. For this purpose we use
    daily precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
            be created

    Returns:
        pandas DataFrame with metadata (read from the cached csv when one
        exists and no rebuild was requested)
    """
    # Validate argument types up front.
    # NOTE(review): asserts vanish under "python -O"; consider TypeError.
    assert isinstance(parameter, Parameter)
    assert isinstance(time_resolution, TimeResolution)
    assert isinstance(period_type, PeriodType)
    assert isinstance(folder, str)
    assert isinstance(write_file, bool)
    assert isinstance(create_new_filelist, bool)

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    # Serve the previously cached metadata csv unless a rebuild was requested.
    if check_file_exist(file_path) and not create_new_filelist:
        metainfo = pd.read_csv(filepath_or_buffer=file_path)
        return metainfo

    # 1-minute data is indexed differently on the server and needs its own
    # metaindex builder.
    if time_resolution != TimeResolution.MINUTE_1:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)
    else:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution,
                                              folder=folder)

    if STATE_NAME not in metainfo.columns:
        # Borrow the STATE column from daily precipitation metadata via a
        # recursive call (see docstring for why that dataset is used).
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        # Join on station name; rows without a match are dropped by the
        # default inner merge.
        metainfo = metainfo.merge(mdp.loc[:, [STATIONNAME_NAME, STATE_NAME]],
                                  on=STATIONNAME_NAME).reset_index(drop=True)

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    # Persist the freshly built metadata when asked to and when no cached
    # file existed before this call.
    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=SUB_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def test_check_parameters():
    """A valid parameter combination passes silently (returns None)."""
    result = check_parameters(parameter=Parameter.PRECIPITATION,
                              time_resolution=TimeResolution.MINUTE_10,
                              period_type=PeriodType.HISTORICAL)
    assert result is None
def create_file_list_for_dwd_server(station_ids: List[int],
                                    parameter: Parameter,
                                    time_resolution: TimeResolution,
                                    period_type: PeriodType,
                                    folder: str = DWD_FOLDER_MAIN,
                                    create_new_filelist=False) -> pd.DataFrame:
    """
    Select data files (links to archives) on the DWD server for the given
    station_ids, parameter, time_resolution and period_type, based on a
    locally cached index of the files that are available online.

    Args:
        station_ids: id(s) for the weather station to ask for data
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local folder holding the cached file index
        create_new_filelist: if True, rebuild the index even if one exists

    Returns:
        pandas DataFrame of the file index rows matching the station ids
    """
    # Validate argument types and coerce the ids to plain integers.
    assert isinstance(station_ids, list)
    station_ids = [int(requested_id) for requested_id in station_ids]
    assert isinstance(parameter, Parameter)
    assert isinstance(time_resolution, TimeResolution)
    assert isinstance(period_type, PeriodType)
    assert isinstance(folder, str)
    assert isinstance(create_new_filelist, bool)

    # Reject unsupported parameter combinations early.
    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    folder = correct_folder_path(folder)

    # Assemble the local path of the cached file index.
    index_name = (f'{FILELIST_NAME}_{parameter.value}_'
                  f'{time_resolution.value}_{period_type.value}')
    index_path = f"{Path(folder, DWD_FOLDER_METADATA, index_name)}{DATA_FORMAT}"

    # Build a fresh index when requested or when none is cached yet.
    if create_new_filelist or not Path(index_path).is_file():
        create_fileindex(parameter=parameter,
                         time_resolution=time_resolution,
                         period_type=period_type,
                         folder=folder)

    file_index = pd.read_csv(filepath_or_buffer=index_path,
                             sep=",",
                             dtype={DWDColumns.FILEID.value: int,
                                    DWDColumns.STATION_ID.value: int,
                                    DWDColumns.FILENAME.value: str})

    station_mask = file_index[DWDColumns.STATION_ID.value].isin(station_ids)
    return file_index.loc[station_mask, :]
def __init__(self,
             station_ids: Union[str, int, List[Union[int, str]]],
             parameter: Union[str, Parameter],
             time_resolution: Union[str, TimeResolution],
             period_type: Union[None, str, list, PeriodType] = None,
             start_date: Union[None, str, Timestamp] = None,
             end_date: Union[None, str, Timestamp] = None,
             humanize_column_names: bool = False) -> None:
    """
    Normalise and validate a data request.

    Args:
        station_ids: one id or a list of ids of the requested weather stations
        parameter: observation measure (enum value or free-text word)
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files; may be omitted when a
            start_date/end_date range is given instead
        start_date: begin of the requested date range
        end_date: end of the requested date range
        humanize_column_names: replace raw column names with readable ones

    Raises:
        ValueError: for unparseable station ids or when no valid
            parameter/resolution/period combination remains
        StartDateEndDateError: when start_date lies after end_date
    """
    if not (period_type or (start_date and end_date)):
        raise ValueError("Define either a 'time_resolution' or both the 'start_date' and 'end_date' and "
                         "leave the other one empty!")

    try:
        self.station_ids = [int(station_id) for station_id in cast_to_list(station_ids)]
    except ValueError:
        raise ValueError("List of station id's can not be parsed to integers.")

    try:
        self.parameter = Parameter(parameter)
    except ValueError:
        self.parameter = _parse_parameter_from_value(
            parameter, PARAMETER_WORDLIST_MAPPING)

    try:
        self.time_resolution = TimeResolution(time_resolution)
    except ValueError:
        self.time_resolution = _parse_parameter_from_value(
            time_resolution, TIMERESOLUTION_WORDLIST_MAPPING)

    self.period_type = []
    for pt in cast_to_list(period_type):
        if pt is None:
            self.period_type.append(None)
            continue
        try:
            self.period_type.append(PeriodType(pt))
        except ValueError:
            # BUG FIX: this fallback previously parsed the whole
            # 'period_type' argument instead of the current element 'pt',
            # so every unparsed element of a mixed list resolved to the
            # same (wrong) value.
            self.period_type.append(
                _parse_parameter_from_value(pt, PERIODTYPE_WORDLIST_MAPPING))

    # Additional sorting required for self.period_type to ensure that for multiple
    # periods the data is first sourced from historical
    self.period_type = sorted(self.period_type)

    self.start_date = parse_date(start_date)
    self.end_date = parse_date(end_date)

    if self.start_date:
        # working with ranges of data means expecting data to be laying between periods, thus including all
        self.period_type = [PeriodType.HISTORICAL, PeriodType.RECENT, PeriodType.NOW]

        if not self.start_date <= self.end_date:
            raise StartDateEndDateError("Error: 'start_date' must be smaller or equal to 'end_date'.")

    # Drop every period that is not served for this parameter/resolution.
    for period_type in self.period_type.copy():
        if not check_parameters(parameter=self.parameter,
                                time_resolution=self.time_resolution,
                                period_type=period_type):
            log.info(f"Combination of: parameter {self.parameter.value}, "
                     f"time_resolution {self.time_resolution.value}, "
                     f"period_type {period_type} not available and removed.")
            self.period_type.remove(period_type)

    # Use the clean up of self.period_type to identify if there's any data with those parameters
    if not self.period_type:
        raise ValueError("No combination for parameter, time_resolution "
                         "and period_type could be found.")

    self.humanize_column_names = humanize_column_names
def metadata_for_dwd_data(parameter: Union[Parameter, str],
                          time_resolution: Union[TimeResolution, str],
                          period_type: Union[PeriodType, str],
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_file_index: bool = False) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that
    creates a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the pandas.DataFrame. For this purpose we
    use daily precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_file_index: if true: a new file_list for metadata will
            be created

    Returns:
        pandas.DataFrame with metadata for selected parameters
    """
    if create_new_file_index:
        reset_file_index_cache()

    parameter = Parameter(parameter)
    time_resolution = TimeResolution(time_resolution)
    period_type = PeriodType(period_type)

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    # 1-minute data is indexed differently on the server and needs its own
    # metaindex builder.
    if time_resolution == TimeResolution.MINUTE_1:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    if all(pd.isnull(metainfo[DWDMetaColumns.STATE.value])):
        # @todo avoid calling function in function -> we have to build a function around to manage missing data
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    create_new_file_index=False)

        # BUG FIX: the station-id column must be selected by its string name
        # (".value"); indexing with the raw enum member raised a KeyError
        # since all other accesses here use string column names.
        stateinfo = pd.merge(metainfo[[DWDMetaColumns.STATION_ID.value]],
                             mdp.loc[:, [DWDMetaColumns.STATION_ID.value,
                                         DWDMetaColumns.STATE.value]],
                             how="left")

        metainfo[DWDMetaColumns.STATE.value] = stateinfo[DWDMetaColumns.STATE.value]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type)

    # NOTE(review): sibling implementations persist when the file does NOT
    # exist and no rebuild was requested ("... and not create_new_filelist");
    # here the write additionally requires create_new_file_index=True —
    # confirm which behavior is intended before changing it.
    if write_file and not file_path.is_file() and create_new_file_index:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_filelist: bool = False) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that
    creates a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the dataframe. For this purpose we use
    daily precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
            be created

    Returns:
        pandas DataFrame with the metadata

    Raises:
        TypeError: when any argument has the wrong type
    """
    if not isinstance(parameter, Parameter):
        raise TypeError("Error: 'parameter' is not of type Parameter(Enum).")
    if not isinstance(time_resolution, TimeResolution):
        raise TypeError(
            "Error: 'time_resolution' is not of type TimeResolution(Enum).")
    if not isinstance(period_type, PeriodType):
        raise TypeError(
            "Error: 'period_type' is not of type PeriodType(Enum).")
    if not isinstance(folder, str):
        raise TypeError("Error: 'folder' is not a string.")
    if not isinstance(write_file, bool):
        raise TypeError("Error: 'write_file' is not a bool.")
    if not isinstance(create_new_filelist, bool):
        raise TypeError("Error: 'create_new_filelist' is not a bool.")

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    # Serve the previously cached metadata csv unless a rebuild was requested.
    if check_file_exist(file_path) and not create_new_filelist:
        metainfo = pd.read_csv(filepath_or_buffer=file_path)
        return metainfo

    # 1-minute data is indexed differently on the server and needs its own
    # metaindex builder.
    if time_resolution == TimeResolution.MINUTE_1:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    if all(pd.isnull(metainfo[DWDColumns.STATE.value])):
        # @todo avoid calling function in function -> we have to build a function around to manage missing data
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        # BUG FIX: the station-id column must be selected by its string name
        # (".value"); indexing with the raw enum member raised a KeyError
        # since all other accesses here use string column names.
        stateinfo = pd.merge(
            metainfo[[DWDColumns.STATION_ID.value]],
            mdp.loc[:, [DWDColumns.STATION_ID.value, DWDColumns.STATE.value]],
            how="left")

        metainfo[DWDColumns.STATE.value] = stateinfo[DWDColumns.STATE.value]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    # Persist the freshly built metadata when asked to and when no cached
    # file existed before this call.
    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def parse_dwd_data(local_files: List[Path], keep_zip: bool = False) -> pd.DataFrame:
    """
    Read station data from the locally stored zip archives that the
    'download_dwd' function produced.

    Each zip is inspected for the contained "produkt" file, which is read
    into a DataFrame. Unreadable zips are reported and deleted; readable
    zips are deleted after parsing unless keep_zip is True.

    Args:
        local_files: list of local stored files that should be read
        keep_zip: If true: The raw zip file will not be deleted,
            Default is: False.

    Returns:
        DataFrame with requested data; empty DataFrame when no file could
        be read
    """
    # Test for types of input parameters
    assert isinstance(local_files, list)
    assert isinstance(keep_zip, bool)

    # Check for files and if empty return empty DataFrame
    if not local_files:
        return pd.DataFrame()

    # All files of one request share the same parameters, so they can be
    # determined from the first filename.
    first_filename = str(local_files[0]).split("/")[-1]
    parameter, time_resolution, period_type = determine_parameters(
        first_filename)
    check_parameters(parameter, time_resolution, period_type)

    data = []
    for file in local_files:
        try:
            with ZipFile(file) as zip_file:
                # Names of all items inside the archive.
                archive_members = [
                    member.filename for member in zip_file.infolist()
                ]
                # Pick the first member matching every match string
                # (the 'produkt' data file).
                product_file = [
                    member for member in archive_members
                    if all(matchstring in member.lower()
                           for matchstring in STATIONDATA_MATCHSTRINGS)
                ].pop(0)
                with zip_file.open(product_file) as file_opened:
                    data_file = pd.read_csv(filepath_or_buffer=file_opened,
                                            sep=";",
                                            na_values="-999")
            data.append(data_file)
        except Exception:
            print(f'''The zipfile {file} couldn't be opened/read and will be removed.''')
            # BUG FIX: the broken file was previously unlinked here AND again
            # in a "finally" block, so the second unlink raised
            # FileNotFoundError whenever keep_zip was False.
            Path(file).unlink()
        else:
            # Successfully parsed; drop the raw archive unless it is wanted.
            if not keep_zip:
                Path(file).unlink()

    # Every zip may have failed to parse; behave like the empty-input case
    # instead of letting pd.concat([]) raise.
    if not data:
        return pd.DataFrame()

    # Put together list of files to a DataFrame
    data = pd.concat(data)

    # Normalise column names: strip whitespace, uppercase, then map the
    # German names to their English equivalents.
    column_names = [
        column_name.upper().strip() for column_name in data.columns
    ]
    column_names = [
        GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(column_name, column_name)
        for column_name in column_names
    ]
    data.columns = column_names

    # Parse the date column from its YYYYMMDD integer/string form.
    data[DATE_NAME] = data[DATE_NAME].apply(
        lambda date: dt.strptime(str(date), "%Y%m%d"))

    return data