def test_parse_enumeration_from_template():
    """Check template resolution to ``Parameter`` and rejection of unknown input."""
    # Both spellings have to resolve to the very same enumeration member
    for template in ("climate_summary", "kl"):
        resolved = parse_enumeration_from_template(template, Parameter)
        assert resolved == Parameter.CLIMATE_SUMMARY

    # A string that matches no member must raise
    with pytest.raises(InvalidParameter):
        parse_enumeration_from_template("climate", Parameter)
def metadata_for_climate_observations(
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_meta_index: bool = False,
    create_new_file_index: bool = False,
) -> pd.DataFrame:
    """
    Retrieve the station metadata for one parameter / time resolution /
    period type combination and flag the stations that have a data file.

    The meta index is fetched, a HAS_FILE column is added and set to True for
    every station id that also shows up in the corresponding file index.

    NOTE(review): earlier documentation of this function mentioned STATE
    information being added from daily precipitation metadata; nothing in this
    function does that, so it presumably happens while the meta index is
    created - confirm.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_meta_index: if true, the meta index cache is reset first
        create_new_file_index: if true, the file index cache is reset first

    Returns:
        pandas.DataFrame with the meta index plus the HAS_FILE column
    """
    if create_new_meta_index:
        reset_meta_index_cache()

    if create_new_file_index:
        reset_file_index_cache()

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    station_id_column = DWDMetaColumns.STATION_ID.value
    has_file_column = DWDMetaColumns.HAS_FILE.value

    meta_index = create_meta_index_for_climate_observations(
        parameter, time_resolution, period_type
    )
    # Start pessimistic, then switch on the stations found in the file index
    meta_index[has_file_column] = False

    file_index = create_file_index_for_climate_observations(
        parameter, time_resolution, period_type
    )

    listed_in_file_index = meta_index[station_id_column].isin(
        file_index[station_id_column]
    )
    meta_index.loc[listed_in_file_index, has_file_column] = True

    return meta_index
def __init__(
    self,
    time_resolution: Union[str, TimeResolution],
    date_times: Optional[Union[str, List[Union[str, datetime]]]] = None,
    start_date: Optional[Union[str, datetime]] = None,
    end_date: Optional[Union[str, datetime]] = None,
    prefer_local: bool = False,
    write_file: bool = False,
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
):
    """
    Set up a RADOLAN request for a time resolution and a set of timestamps.

    The timestamps come from exactly one of three sources, checked in this
    order: the string "latest", an explicit ``date_times`` value, or the range
    spanned by ``start_date`` and ``end_date``.

    Args:
        time_resolution: hourly or daily, as string or TimeResolution
        date_times: "latest" to take the last entry of the RADOLAN file
            index, otherwise one or several timestamps
        start_date: start of the date range, only used without date_times
        end_date: end of the date range, only used without date_times
        prefer_local: stored on the instance as is
        write_file: stored on the instance as is
        folder: stored on the instance as is

    Raises:
        ValueError: if the time resolution is neither hourly nor daily, or if
            neither date_times nor both start_date and end_date are given
    """
    time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )

    if time_resolution not in (TimeResolution.HOURLY, TimeResolution.DAILY):
        raise ValueError("RADOLAN only supports hourly and daily resolution.")

    self.time_resolution = time_resolution

    if date_times == "latest":
        file_index_radolan = create_file_index_for_radolan(time_resolution)

        # Only the last row of the file index is of interest here
        self.date_times = pd.Series(
            file_index_radolan[DWDMetaColumns.DATETIME.value][-1:]
        )
    elif date_times:
        self.date_times = pd.Series(
            pd.to_datetime(date_times, infer_datetime_format=True)
        )
    else:
        # Fail early with a clear message; pandas would otherwise raise a
        # ValueError about missing date_range arguments that does not point
        # to the arguments of this constructor.
        if start_date is None or end_date is None:
            raise ValueError(
                "Define either 'date_times' or both 'start_date' and 'end_date'."
            )

        # date_range is called without freq, i.e. with the pandas default
        self.date_times = pd.Series(
            pd.date_range(
                pd.to_datetime(start_date, infer_datetime_format=True),
                pd.to_datetime(end_date, infer_datetime_format=True),
            )
        )

    # Align every timestamp to minute 50 of its hour
    # NOTE(review): presumably the minute RADOLAN products are stamped with
    # - confirm.
    self.date_times = self.date_times.dt.floor(freq="H") + pd.Timedelta(minutes=50)

    self.date_times = self.date_times.drop_duplicates().sort_values()

    self.prefer_local = prefer_local
    self.write_file = write_file
    self.folder = folder
def get_nearby_stations(
    latitude: float,
    longitude: float,
    minimal_available_date: Union[datetime, str],
    maximal_available_date: Union[datetime, str],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of the weather stations closest to one location.

    Exactly one of 'num_stations_nearby' and 'max_distance_in_km' has to be
    given. Stations are only considered if their FROM_DATE/TO_DATE range
    covers the requested timespan.

    Args:
        latitude: latitude of location to search for nearest weather station
        longitude: longitude of location to search for nearest weather station
        minimal_available_date: start date of timespan where measurements
            should be available
        maximal_available_date: end date of timespan where measurements
            should be available
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum distance
            to location in km

    Returns:
        DataFrame with the metadata of the selected stations and an additional
        DISTANCE_TO_LOCATION column (kilometres); empty if nothing matched

    Raises:
        ValueError: if both or none of the two selection criteria are set, or
            if 'num_stations_nearby' is 0
        InvalidParameterCombination: if parameter, time resolution and period
            type do not form a valid combination
    """
    if num_stations_nearby and max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    if num_stations_nearby == 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    # Without any criterion there is no neighbour count to query for
    if num_stations_nearby is None and not max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not isinstance(minimal_available_date, datetime):
        minimal_available_date = parse_datetime(minimal_available_date)
    if not isinstance(maximal_available_date, datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(
        parameter, time_resolution, period_type
    )

    # Keep only stations whose availability covers the requested timespan
    metadata = metadata[
        (metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date)
        & (metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date)
    ].reset_index(drop=True)

    if metadata.empty:
        # Nothing left to search in, skip the neighbour query altogether
        metadata_location = metadata.copy()
        distances_km = np.array([], dtype=float)
    else:
        # For distance filtering make normal query including all stations
        if max_distance_in_km:
            num_stations_nearby = metadata.shape[0]

        distances, indices_nearest_neighbours = _derive_nearest_neighbours(
            metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby
        )

        # Require list of indices for consistency
        # Cast to np.array required for subset
        indices_nearest_neighbours = np.array(
            cast_to_list(indices_nearest_neighbours)
        )

        distances_km = np.array(distances * KM_EARTH_RADIUS)

        # Filter for distance based on calculated distances
        if max_distance_in_km:
            _in_max_distance_indices = np.where(distances_km <= max_distance_in_km)[0]
            indices_nearest_neighbours = indices_nearest_neighbours[
                _in_max_distance_indices
            ]
            distances_km = distances_km[_in_max_distance_indices]

        # Explicit copy so that adding the distance column below never writes
        # into (or warns about) a slice of the full metadata table
        metadata_location = metadata.loc[indices_nearest_neighbours, :].copy()

    metadata_location["DISTANCE_TO_LOCATION"] = distances_km

    if metadata_location.empty:
        logger.warning(f"No weather station was found for coordinate "
                       f"{latitude}°N and {longitude}°E ")

    return metadata_location
def __init__(
    self,
    station_ids: Union[str, int, List[Union[int, str]]],
    parameter: Union[str, Parameter, List[Union[str, Parameter]]],
    time_resolution: Union[str, TimeResolution],
    period_type: Union[
        Union[None, str, PeriodType], List[Union[None, str, PeriodType]]
    ] = None,
    start_date: Union[None, str, Timestamp] = None,
    end_date: Union[None, str, Timestamp] = None,
    prefer_local: bool = False,
    write_file: bool = False,
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    create_new_file_index: bool = False,
) -> None:
    """
    Class with mostly flexible arguments to define a request regarding DWD data.
    Special handling for period type. If no period type is given, start_date
    and/or end_date are used instead: all period types are then considered and
    the parsed dates are stored on the instance.

    Args:
        station_ids: definition of stations by str, int or list of str/int,
            will be parsed to list of int
        parameter: str or parameter enumeration defining the requested parameter
        time_resolution: str or time resolution enumeration defining the
            requested time resolution
        period_type: str or period type enumeration defining the requested
            period type
        start_date: replacement for period type to define exact time of
            requested data
        end_date: replacement for period type to define exact time of
            requested data
        prefer_local: definition if data should rather be taken from a local
            source
        write_file: should data be written to a local file
        folder: place where file lists (and station data) are stored
        tidy_data: reshape DataFrame to a more tidy, row based version of data
        humanize_column_names: replace column names by more meaningful ones
        create_new_file_index: definition if the file index should be recreated

    Raises:
        ValueError: if none of period_type, start_date and end_date is given,
            or if a station id can not be parsed to an integer
        StartDateEndDateError: if start_date lies after end_date
    """
    if not (period_type or start_date or end_date):
        # time_resolution is mandatory, the optional choice is period_type
        raise ValueError(
            "Define either a 'period_type' or one of or both 'start_date' and "
            "'end_date' and leave 'period_type' empty!"
        )

    try:
        self.station_ids = [
            int(station_id) for station_id in cast_to_list(station_ids)
        ]
    except ValueError as err:
        raise ValueError(
            "List of station id's can not be parsed to integers."
        ) from err

    self.parameter = [
        parse_enumeration_from_template(p, Parameter)
        for p in cast_to_list(parameter)
    ]

    self.time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )

    # start date and end date required for collect_data in any case
    self.start_date = None
    self.end_date = None

    if period_type:
        # For the case that a period_type is given, parse the period type(s)
        self.period_type = []
        for pt in cast_to_list(period_type):
            if pt is None:
                self.period_type.append(None)
            else:
                self.period_type.append(
                    parse_enumeration_from_template(pt, PeriodType)
                )

        # Additional sorting required for self.period_type to ensure that for
        # multiple periods the data is first sourced from historical
        # NOTE(review): relies on PeriodType members being orderable; a None
        # entry mixed with members may not be comparable - confirm.
        self.period_type = sorted(self.period_type)
    else:
        # working with ranges of data means expecting data to be laying between
        # periods, thus including all periods
        self.period_type = [
            PeriodType.HISTORICAL,
            PeriodType.RECENT,
            PeriodType.NOW,
        ]

        # If only one date given, make the other one equal
        if not start_date:
            start_date = end_date

        if not end_date:
            end_date = start_date

        # Only strings are sent through dateparser; values that already are
        # Timestamp/datetime objects (allowed by the signature) are handed
        # to Timestamp directly.
        self.start_date = Timestamp(
            dateparser.parse(start_date)
            if isinstance(start_date, str)
            else start_date
        )
        self.end_date = Timestamp(
            dateparser.parse(end_date) if isinstance(end_date, str) else end_date
        )

        if not self.start_date <= self.end_date:
            raise StartDateEndDateError(
                "Error: 'start_date' must be smaller or equal to 'end_date'."
            )

    self.prefer_local = prefer_local
    self.write_file = write_file
    self.folder = folder
    # If more than one parameter requested, automatically tidy data
    self.tidy_data = len(self.parameter) > 1 or tidy_data
    self.humanize_column_names = humanize_column_names
    self.create_new_file_index = create_new_file_index
def get_nearby_stations(
    latitudes: Union[List[float], np.array],
    longitudes: Union[List[float], np.array],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> Tuple[List[int], List[List[float]]]:
    """
    Provides a list of weather station ids for the requested data

    Exactly one of 'num_stations_nearby' and 'max_distance_in_km' has to be
    given.

    Args:
        latitudes: latitudes of locations to search for nearest
            weather station
        longitudes: longitudes of locations to search for nearest
            weather station
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum
            distance to location in km

    Returns:
        list of stations ids for the given locations/coordinate pairs and
        a list of distances in kilometer to the weather station

    Raises:
        ValueError: if both or none of the two selection criteria are set, or
            if 'num_stations_nearby' is 0
        InvalidParameterCombination: if parameter, time resolution and period
            type do not form a valid combination
    """
    if num_stations_nearby and max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    if num_stations_nearby == 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    # Without any criterion there is no neighbour count to query for
    if num_stations_nearby is None and not max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    # Anything that is not a plain list is converted to an array; each
    # coordinate sequence is converted on its own so that the latitudes are
    # never replaced by the longitudes.
    if not isinstance(latitudes, list):
        latitudes = np.array(latitudes)
    if not isinstance(longitudes, list):
        longitudes = np.array(longitudes)

    coords = Coordinates(latitudes, longitudes)

    metadata = metadata_for_dwd_data(parameter, time_resolution, period_type)

    # For distance filtering make normal query including all stations
    if max_distance_in_km:
        num_stations_nearby = metadata.shape[0]

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby
    )

    # Make sure to get list of lists [[dist1_1, dist1_2], [dist2_1, dist2_2]]
    if num_stations_nearby == 1:
        distances = np.array([distances])
    else:
        distances = distances.T

    if np.max(indices_nearest_neighbours.shape) > 1:
        indices_nearest_neighbours = indices_nearest_neighbours[0]

    # Require list of indices for consistency
    # Cast to np.array required for subset
    indices_nearest_neighbours = np.array(cast_to_list(indices_nearest_neighbours))

    distances_km = np.array(distances * KM_EARTH_RADIUS)

    # Filter for distance based on calculated distances
    if max_distance_in_km:
        indices_stations_in_distance = (
            np.max(distances_km, axis=1) <= max_distance_in_km
        )

        # Reduce stations to those in distance
        distances_km = distances_km[indices_stations_in_distance]
        indices_nearest_neighbours = indices_nearest_neighbours[
            indices_stations_in_distance
        ]

    return (
        metadata.loc[
            indices_nearest_neighbours, DWDMetaColumns.STATION_ID.value
        ].values.tolist(),
        distances_km.tolist(),
    )
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Run the whole data collection pipeline for a set of stations.

    Every distinct station id is handled on its own: if local data is
    preferred and can be restored, it is used; otherwise the remote file list
    is built, the files are downloaded and parsed and, if requested, the
    result is written to the local store. The per-station frames are finally
    concatenated and optionally tidied and renamed.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration
        time_resolution: time resolution as enumeration
        period_type: period type as enumeration
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of
            values for a datetime in a row
            e.g. station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
            human consumption
        run_download_only: boolean to run only the download and storing process
        create_new_file_index: boolean if to create a new file index for the
            data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids, an
        empty DataFrame if the collected frames could not be concatenated, or
        None if only the download was requested

    Raises:
        InvalidParameterCombination: if parameter, time resolution and period
            type do not form a valid combination
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    combination_is_valid = check_parameters(parameter, time_resolution, period_type)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    # One DataFrame per station id ends up in here
    station_frames = []

    for station_id in set(station_ids):
        store_key = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            # Local store first; fall through to the download when it is empty
            restored = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )

            if not restored.empty:
                log.info(f"Data for {store_key} restored from local.")
                station_frames.append(restored)
                continue

        log.info(f"Data for {store_key} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )

        if len(remote_files) == 0:
            log.info(f"No files found for {store_key}. Station will be skipped.")
            continue

        downloaded = download_climate_observations_data_parallel(remote_files)

        parsed = parse_climate_observations_data(
            downloaded, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                parsed,
                station_id,
                parameter,
                time_resolution,
                period_type,
                folder,
            )

        station_frames.append(parsed)

    if run_download_only:
        return None

    try:
        combined = pd.concat(station_frames)
    except ValueError:
        # The concatenation failed, hand back an empty frame instead
        return pd.DataFrame()

    if tidy_data:
        combined = _tidy_up_data(combined, parameter)

    if not humanize_column_names:
        return combined

    # Assign meaningful column names (humanized).
    name_mapping = create_humanized_column_names_mapping(time_resolution, parameter)

    if tidy_data:
        # Tidy data carries the original names as values of the element column
        element_column = DWDMetaColumns.ELEMENT.value
        combined[element_column] = combined[element_column].apply(
            lambda element: name_mapping[element]
        )
    else:
        combined = combined.rename(columns=name_mapping)

    return combined