def __init__(
    self,
    parameter_set: Union[str, DWDObservationParameterSet],
    resolution: Union[str, DWDObservationResolution],
    period: Union[str, DWDObservationPeriod] = None,
    start_date: Union[None, str, Timestamp] = None,
    end_date: Union[None, str, Timestamp] = None,
):
    """
    Set up the request for one parameter set / resolution / period combination.

    :param parameter_set: parameter set as string or enumeration
    :param resolution: resolution as string or enumeration
    :param period: period as string or enumeration
    :param start_date: start date, handed on to the parent initializer
    :param end_date: end date, handed on to the parent initializer
    :raises InvalidParameterCombination: if the check of the three parsed
        values fails
    """
    super().__init__(start_date=start_date, end_date=end_date)

    # Pass every argument through the parser together with its enumeration
    parameter_set, resolution, period = (
        parse_enumeration_from_template(parameter_set, DWDObservationParameterSet),
        parse_enumeration_from_template(resolution, DWDObservationResolution),
        parse_enumeration_from_template(period, DWDObservationPeriod),
    )

    combination_is_valid = check_dwd_observations_parameter_set(
        parameter_set, resolution, period
    )
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter_set.value}, {resolution.value}, "
            f"{period.value} is invalid."
        )

    # Only a validated combination is stored on the instance
    self.parameter, self.resolution, self.period = parameter_set, resolution, period
def __init__(
    self,
    parameter_set: Union[str, DWDObservationParameterSet],
    resolution: Union[str, DWDObservationResolution],
    period: Union[str, DWDObservationPeriod] = None,
    start_date: Union[None, str, Timestamp] = None,
    end_date: Union[None, str, Timestamp] = None,
):
    """
    Parse and validate the requested combination and keep it on the instance.

    :param parameter_set: parameter set given as str or enumeration
    :param resolution: resolution given as str or enumeration
    :param period: period given as str or enumeration
    :param start_date: start date to limit the stations
    :param end_date: end date to limit the stations
    :raises InvalidParameterCombination: if the check of the three parsed
        values fails
    """
    super().__init__(start_date=start_date, end_date=end_date)

    parameter_set = parse_enumeration_from_template(
        parameter_set, DWDObservationParameterSet
    )
    resolution = parse_enumeration_from_template(
        resolution, DWDObservationResolution
    )
    period = parse_enumeration_from_template(period, DWDObservationPeriod)

    # TODO: move to _all and replace error with logging + empty dataframe
    if check_dwd_observations_parameter_set(parameter_set, resolution, period):
        self.parameter = parameter_set
        self.resolution = resolution
        self.period = period
    else:
        raise InvalidParameterCombination(
            f"The combination of {parameter_set.value}, {resolution.value}, "
            f"{period.value} is invalid."
        )
def collect_climate_observations_data(
    station_id: int,
    parameter_set: DWDObservationParameterSet,
    resolution: DWDObservationResolution,
    period: DWDObservationPeriod,
) -> pd.DataFrame:
    """
    Collect the observation data of a single station.

    The combination of parameter set, resolution and period is checked first.
    Afterwards the remote files of the station are listed, downloaded, parsed
    and passed through the field type coercion. If no remote file is listed,
    the station is skipped with a log message and an empty DataFrame is
    returned instead.

    :param station_id: station id that is being loaded
    :param parameter_set: Parameter as enumeration
    :param resolution: Time resolution as enumeration
    :param period: Period type as enumeration
    :return: DataFrame with the data of the station, empty if no file exists
    :raises InvalidParameterCombination: if the combination check fails
    """
    combination_is_valid = check_dwd_observations_parameter_set(
        parameter_set, resolution, period
    )
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"Invalid combination: {parameter_set.value} / {resolution.value} / "
            f"{period.value}"
        )

    remote_files = create_file_list_for_climate_observations(
        station_id, parameter_set, resolution, period
    )

    # Regular path: there is something to download for this station
    if len(remote_files) > 0:
        downloaded = download_climate_observations_data_parallel(remote_files)
        parsed = parse_climate_observations_data(
            downloaded, parameter_set, resolution
        )
        return coerce_field_types(parsed, resolution)

    # Nothing listed for the station: report it and hand back an empty frame
    parameter_identifier = build_parameter_set_identifier(
        parameter_set, resolution, period, station_id
    )
    log.info(
        f"No files found for {parameter_identifier}. Station will be skipped."
    )
    return pd.DataFrame()
def get_nearby_stations_by_number(
    latitude: float,
    longitude: float,
    num_stations_nearby: int,
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    minimal_available_date: Optional[Union[datetime, str]] = None,
    maximal_available_date: Optional[Union[datetime, str]] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of the weather stations nearest to a location

    :param latitude: Latitude of location to search for nearest
        weather station
    :param longitude: Longitude of location to search for nearest
        weather station
    :param num_stations_nearby: Number of stations that should be nearby
    :param parameter: Observation measure
    :param time_resolution: Frequency/granularity of measurement interval
    :param period_type: Recent or historical files
    :param minimal_available_date: Start date of timespan where measurements
        should be available, optional
    :param maximal_available_date: End date of timespan where measurements
        should be available, optional
    :return: DataFrame with the metadata rows of the nearest stations and an
        additional column holding the distance to the location
    :raises ValueError: if 'num_stations_nearby' is smaller than 1 or if
        'minimal_available_date' is after 'maximal_available_date'
    :raises InvalidParameterCombination: if the check of parameter,
        time resolution and period type fails
    """
    if num_stations_nearby <= 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    # A falsy bound means "no limit" and is left untouched, a datetime is used
    # as it is, anything else is parsed.
    if minimal_available_date and not isinstance(minimal_available_date, datetime):
        minimal_available_date = parse_datetime(minimal_available_date)

    # Each bound is converted based on its own value only, so a lone maximal
    # date string is parsed and a missing maximal date is never parsed.
    if maximal_available_date and not isinstance(maximal_available_date, datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if (
        minimal_available_date
        and maximal_available_date
        and minimal_available_date > maximal_available_date
    ):
        raise ValueError(
            "'minimal_available_date' has to be before "
            "'maximal_available_date'"
        )

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(
        parameter, time_resolution, period_type
    )

    # Filter only for stations that have a file
    metadata = metadata[metadata[DWDMetaColumns.HAS_FILE.value].values]

    if minimal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date
        ]

    if maximal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date
        ]

    # Positional row numbers are needed for the iloc lookup below
    metadata = metadata.reset_index(drop=True)

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby
    )

    distances = pd.Series(distances)
    indices_nearest_neighbours = pd.Series(indices_nearest_neighbours)

    # If num_stations_nearby is higher than the actual amount of stations,
    # further indices and distances are added which have to be filtered out
    num_results = min(metadata.shape[0], num_stations_nearby)
    distances = distances[:num_results]
    indices_nearest_neighbours = indices_nearest_neighbours[:num_results]

    distances_km = np.array(distances * KM_EARTH_RADIUS)

    metadata_location = metadata.iloc[indices_nearest_neighbours, :].reset_index(
        drop=True
    )

    metadata_location[DWDMetaColumns.DISTANCE_TO_LOCATION.value] = distances_km

    if metadata_location.empty:
        logger.warning(
            f"No weather stations were found for coordinate "
            f"{latitude}°N and {longitude}°E "
        )

    return metadata_location
def get_nearby_stations(
    latitude: float,
    longitude: float,
    minimal_available_date: Union[datetime, str],
    maximal_available_date: Union[datetime, str],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of weather stations near a location, selected either
    by a number of stations or by a maximum distance

    Args:
        latitude: latitude of location to search for nearest weather station
        longitude: longitude of location to search for nearest weather station
        minimal_available_date: Start date of timespan where measurements
            should be available
        maximal_available_date: End date of timespan where measurements
            should be available
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum
            distance to location in km

    Returns:
        DataFrame with the metadata rows of the selected stations and an
        additional "DISTANCE_TO_LOCATION" column

    Raises:
        ValueError: if both or none of 'num_stations_nearby' and
            'max_distance_in_km' are set, or if 'num_stations_nearby' is
            smaller than 1
        InvalidParameterCombination: if the check of parameter,
            time resolution and period type fails
    """
    # The two filter criteria exclude each other
    if num_stations_nearby and max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    # Zero and negative numbers are rejected alike, as the message states
    if num_stations_nearby is not None and num_stations_nearby <= 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    # Without any criterion 'num_stations_nearby' would be handed to the
    # neighbour search as None
    if not num_stations_nearby and not max_distance_in_km:
        raise ValueError("Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not isinstance(minimal_available_date, datetime):
        minimal_available_date = parse_datetime(minimal_available_date)
    if not isinstance(maximal_available_date, datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(
        parameter, time_resolution, period_type
    )

    # Keep stations whose date range covers the requested timespan; the index
    # is reset so that the labels used with .loc below are row numbers
    metadata = metadata[
        (metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date)
        & (metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date)
    ].reset_index(drop=True)

    # For distance filtering make normal query including all stations
    if max_distance_in_km:
        num_stations_nearby = metadata.shape[0]

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby
    )

    # Require list of indices for consistency
    # Cast to np.array required for subset
    indices_nearest_neighbours = np.array(cast_to_list(indices_nearest_neighbours))
    distances_km = np.array(distances * KM_EARTH_RADIUS)

    # Filter for distance based on calculated distances
    if max_distance_in_km:
        _in_max_distance_indices = np.where(distances_km <= max_distance_in_km)[0]
        indices_nearest_neighbours = indices_nearest_neighbours[
            _in_max_distance_indices
        ]
        distances_km = distances_km[_in_max_distance_indices]

    # The indices are always a numpy array at this point, so they can be used
    # for the lookup without any further type check
    metadata_location = metadata.loc[indices_nearest_neighbours, :]
    metadata_location["DISTANCE_TO_LOCATION"] = distances_km

    if metadata_location.empty:
        logger.warning(
            f"No weather station was found for coordinate "
            f"{latitude}°N and {longitude}°E "
        )

    return metadata_location
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Run the data collection for a set of stations, one station at a time.

    For each distinct station id the data is, if preferred, restored from the
    local store first. If that is not preferred or yields nothing, the remote
    files are listed, downloaded and parsed, and optionally written to the
    local store. The frames of all stations are concatenated at the end and
    optionally tidied and given humanized names.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration
        time_resolution: time resolution as enumeration
        period_type: period type as enumeration
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of
            values for a datetime in a row
            e.g. station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
            human consumption
        run_download_only: boolean to run only the download and storing process
        create_new_file_index: boolean if to create a new file index for the
            data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids, an
        empty DataFrame if the concatenation raises a ValueError, or None if
        only the download was requested

    Raises:
        InvalidParameterCombination: if the check of parameter,
            time resolution and period type fails
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    combination_is_valid = check_parameters(parameter, time_resolution, period_type)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    # One DataFrame per station id that delivered data
    station_frames = []

    for station_id in set(station_ids):
        request_string = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            # Local store first; a non-empty result ends the work for this id
            restored_frame = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )
            if not restored_frame.empty:
                log.info(f"Data for {request_string} restored from local.")
                station_frames.append(restored_frame)
                continue

        log.info(f"Data for {request_string} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )
        if not len(remote_files):
            log.info(f"No files found for {request_string}. Station will be skipped.")
            continue

        downloaded = download_climate_observations_data_parallel(remote_files)
        parsed_frame = parse_climate_observations_data(
            downloaded, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                parsed_frame,
                station_id,
                parameter,
                time_resolution,
                period_type,
                folder,
            )

        station_frames.append(parsed_frame)

    if run_download_only:
        return None

    try:
        combined = pd.concat(station_frames)
    except ValueError:
        return pd.DataFrame()

    if tidy_data:
        combined = _tidy_up_data(combined, parameter)

    if not humanize_column_names:
        return combined

    # Assign meaningful column names (humanized).
    humanized_names = create_humanized_column_names_mapping(
        time_resolution, parameter
    )

    if not tidy_data:
        return combined.rename(columns=humanized_names)

    # In tidy form the names to humanize are values of the element column
    combined[DWDMetaColumns.ELEMENT.value] = combined[
        DWDMetaColumns.ELEMENT.value
    ].apply(lambda element: humanized_names[element])
    return combined