示例#1
0
def test_file_index_creation():
    """
    Check creation of the file index for daily climate summaries (recent period).

    The index must not be empty, must be identical when rebuilt after a cache
    reset, and must list the expected archive for station 1048. A second
    combination (1-minute resolution, historical) is expected to raise an
    HTTPError.
    """
    daily_recent_summary = (
        Parameter.CLIMATE_SUMMARY,
        TimeResolution.DAILY,
        PeriodType.RECENT,
    )

    # First build, starting from a cleared cache
    reset_file_index_cache()
    first_index = create_file_index_for_dwd_server(*daily_recent_summary)
    assert not first_index.empty

    # Rebuilding after another cache reset has to give an equal index
    reset_file_index_cache()
    second_index = create_file_index_for_dwd_server(*daily_recent_summary)
    assert first_index.equals(second_index)

    # Exactly one archive is expected for station 1048
    is_station_1048 = first_index[DWDMetaColumns.STATION_ID.value] == 1048
    station_files = first_index.loc[is_station_1048, DWDMetaColumns.FILENAME.value]
    assert station_files.values.tolist() == [
        "daily/kl/recent/tageswerte_KL_01048_akt.zip"
    ]

    # This combination is expected to fail with an HTTP error
    with pytest.raises(requests.exceptions.HTTPError):
        create_file_index_for_dwd_server(
            Parameter.CLIMATE_SUMMARY, TimeResolution.MINUTE_1, PeriodType.HISTORICAL
        )
def create_file_list_for_dwd_server(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_file_index: bool = False,
) -> List[str]:
    """
    Select the data files (links to archives) that belong to the given
    station_ids for one combination of parameter, time_resolution and
    period_type. The selection is made from the file index created for that
    combination.

    Args:
        station_ids: ids of the weather stations to ask for data
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_file_index: if set, the file index cache is reset before
            the file index is created
    Returns:
        List of file names taken from the file index for the requested stations
    """
    if create_new_file_index:
        reset_file_index_cache()

    # Strings are coerced to their enumeration members while building the index
    available_files = create_file_index_for_dwd_server(
        Parameter(parameter),
        TimeResolution(time_resolution),
        PeriodType(period_type),
    )

    station_column = available_files[DWDMetaColumns.STATION_ID.value]
    is_requested = station_column.isin(station_ids)

    requested_files = available_files.loc[
        is_requested, DWDMetaColumns.FILENAME.value
    ]

    return requested_files.values.tolist()
示例#3
0
def metadata_for_climate_observations(
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_meta_index: bool = False,
    create_new_file_index: bool = False,
) -> pd.DataFrame:
    """
    Retrieve the station metadata for a combination of parameter,
    time_resolution and period_type and mark the stations that have a file.

    The meta index of the combination is extended by a boolean column
    (DWDMetaColumns.HAS_FILE) which is True for every station whose id also
    occurs in the file index of the same combination and False otherwise.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_meta_index: if true: the meta index cache is reset before
         the meta index is created
        create_new_file_index: if true: the file index cache is reset before
         the file index is created
    Returns:
        pandas.DataFrame with metadata for selected parameters
    """
    if create_new_meta_index:
        reset_meta_index_cache()

    if create_new_file_index:
        reset_file_index_cache()

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(
        time_resolution, TimeResolution
    )
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    request = (parameter, time_resolution, period_type)

    meta_index = create_meta_index_for_climate_observations(*request)

    # Every station starts out as "has no file" ...
    meta_index[DWDMetaColumns.HAS_FILE.value] = False

    file_index = create_file_index_for_climate_observations(*request)

    # ... and is switched to True when its id is listed in the file index
    station_id_column = DWDMetaColumns.STATION_ID.value
    listed_in_file_index = meta_index[station_id_column].isin(
        file_index[station_id_column]
    )
    meta_index.loc[listed_in_file_index, DWDMetaColumns.HAS_FILE.value] = True

    return meta_index
示例#4
0
    def collect_data(self) -> Generator[pd.DataFrame, None, None]:
        """
        Method to collect data for a defined request. The function is built as a
        generator in order to not clog the memory, thus if the user wants the data
        as one pandas DataFrame the generator has to be combined manually via
        pd.concat(list(request.collect_data())).

        For every station the data of all requested parameters and period types
        is gathered. Within one parameter, rows whose date was already delivered
        by a previously processed period type are dropped. If start_date is set
        the result is restricted to start_date..end_date (both inclusive).
        Stations without any data are skipped.

        Args:
            same as init

        Returns:
            via a generator per station a pandas.DataFrame
        """
        if self.create_new_file_index:
            reset_file_index_cache()

        date_column = DWDMetaColumns.DATE.value

        for station_id in self.station_ids:
            # Frames are gathered in a list and concatenated once, because
            # DataFrame.append is no longer available in pandas >= 2.0.
            station_frames = []

            for parameter in self.parameter:
                # Date columns of the frames already kept for this parameter
                known_dates = []

                for period_type in self.period_type:
                    try:
                        df_period = collect_dwd_data(
                            station_ids=[station_id],
                            parameter=parameter,
                            time_resolution=self.time_resolution,
                            period_type=period_type,
                            folder=self.folder,
                            prefer_local=self.prefer_local,
                            write_file=self.write_file,
                            tidy_data=self.tidy_data,
                            humanize_column_names=self.humanize_column_names,
                            create_new_file_index=False,
                        )
                    except InvalidParameterCombination:
                        log.info(
                            f"Combination for "
                            f"{parameter.value}/"
                            f"{self.time_resolution.value}/"
                            f"{period_type} does not exist and is skipped."
                        )

                        continue

                    has_dates = date_column in df_period.columns

                    # Filter out values which already are in the collected data.
                    # Only possible if both sides actually carry a date column.
                    if has_dates and known_dates:
                        already_collected = df_period[date_column].isin(
                            pd.concat(known_dates)
                        )
                        df_period = df_period[~already_collected]

                    # Empty frames contribute no rows, so they are left out
                    if df_period.empty:
                        continue

                    if has_dates:
                        known_dates.append(df_period[date_column])

                    station_frames.append(df_period)

            # Nothing collected for this station: skip it before touching the
            # date column, which would not exist on an empty frame
            if not station_frames:
                continue

            df_station = pd.concat(station_frames, ignore_index=True)

            # Filter for dates range if start_date and end_date are defined
            if self.start_date:
                df_station = df_station[
                    (df_station[date_column] >= self.start_date)
                    & (df_station[date_column] <= self.end_date)
                ]

            # Empty dataframe should be skipped
            if df_station.empty:
                continue

            yield df_station
示例#5
0
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Organize the complete pipeline of data collection for a set of stations.

    Each distinct station id is processed on its own: if prefer_local is set,
    the data is first looked up in the local store; when nothing is found there
    (or prefer_local is not set) the matching remote files are determined,
    downloaded and parsed. With write_file the freshly parsed data is handed to
    the local store. Afterwards the data of all stations is concatenated and
    optionally tidied and given humanized names.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration
        time_resolution: time resolution as enumeration
        period_type: period type as enumeration
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of values
        for a datetime in a row
        e.g. station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
        human consumption
        run_download_only: boolean to run only the download and storing process
        create_new_file_index: boolean if to create a new file index for the
        data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids, an empty
        DataFrame if nothing could be concatenated, or None if run_download_only
        is set

    Raises:
        InvalidParameterCombination: if check_parameters rejects the combination
        of parameter, time_resolution and period_type
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    combination_is_valid = check_parameters(parameter, time_resolution, period_type)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    request = (parameter, time_resolution, period_type)

    # One DataFrame per successfully processed station id
    station_frames = []
    for station_id in set(station_ids):
        store_key = _build_local_store_key(station_id, *request)

        if prefer_local:
            # Try the local store first; a non-empty result ends this iteration
            local_frame = restore_climate_observations(station_id, *request, folder)

            if not local_frame.empty:
                log.info(f"Data for {store_key} restored from local.")
                station_frames.append(local_frame)
                continue

        log.info(f"Data for {store_key} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], *request
        )

        if len(remote_files) == 0:
            log.info(f"No files found for {store_key}. Station will be skipped.")
            continue

        downloaded = download_climate_observations_data_parallel(remote_files)
        parsed_frame = parse_climate_observations_data(
            downloaded, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(parsed_frame, station_id, *request, folder)

        station_frames.append(parsed_frame)

    if run_download_only:
        return None

    # pd.concat raises ValueError when there is nothing to concatenate
    try:
        combined = pd.concat(station_frames)
    except ValueError:
        return pd.DataFrame()

    if tidy_data:
        combined = _tidy_up_data(combined, parameter)

    if not humanize_column_names:
        return combined

    # Assign meaningful (humanized) names
    readable_names = create_humanized_column_names_mapping(time_resolution, parameter)

    if not tidy_data:
        # Untidy data: the names to replace are the column labels
        return combined.rename(columns=readable_names)

    # Tidy data: the names to replace are the values of the element column
    element_column = DWDMetaColumns.ELEMENT.value
    combined[element_column] = combined[element_column].apply(
        lambda element: readable_names[element]
    )

    return combined