Пример #1
0
def test_collect_dwd_data_success():
    """Collect daily historical climate summary data twice and compare to TEST_FILE."""

    def collect(prefer_local):
        # Both scenarios issue the same request and differ only in whether
        # locally stored data is preferred.
        return collect_climate_observations_data(
            station_ids=[1],
            parameter=Parameter.CLIMATE_SUMMARY,
            time_resolution=TimeResolution.DAILY,
            period_type=PeriodType.HISTORICAL,
            prefer_local=prefer_local,
            write_file=True,
            tidy_data=False,
        )

    # Scenario 1: take fresh data and write it to the given folder, so that the
    # following scenario does not depend on any old data.
    fresh = collect(prefer_local=False)
    assert fresh.equals(TEST_FILE)

    # Scenario 2: try to get the data from the given folder, where it was
    # placed by the first scenario, and restore it from there.
    restored = collect(prefer_local=True)
    assert restored.equals(TEST_FILE)
Пример #2
0
def test_collect_daily_vanilla():
    """Check the column layout of recent daily climate summary data (real data)."""
    expected_columns = [
        "STATION_ID",
        "DATE",
        "QN_3",
        "FX",
        "FM",
        "QN_4",
        "RSK",
        "RSKF",
        "SDK",
        "SHK_TAG",
        "NM",
        "VPM",
        "PM",
        "TMK",
        "UPM",
        "TXK",
        "TNK",
        "TGK",
    ]

    observations = collect_climate_observations_data(
        station_id=1048,
        parameter_set=DWDObservationParameterSet.CLIMATE_SUMMARY,
        resolution=DWDObservationResolution.DAILY,
        period=DWDObservationPeriod.RECENT,
    )

    assert list(observations.columns.values) == expected_columns
Пример #3
0
def test_collect_daily_vanilla():
    """Check the column layout of untidied recent daily climate summary data."""
    expected_columns = [
        "STATION_ID",
        "DATE",
        "QN_3",
        "FX",
        "FM",
        "QN_4",
        "RSK",
        "RSKF",
        "SDK",
        "SHK_TAG",
        "NM",
        "VPM",
        "PM",
        "TMK",
        "UPM",
        "TXK",
        "TNK",
        "TGK",
    ]

    observations = collect_climate_observations_data(
        station_ids=[1048],
        parameter=Parameter.CLIMATE_SUMMARY,
        time_resolution=TimeResolution.DAILY,
        period_type=PeriodType.RECENT,
        tidy_data=False,
    )

    assert list(observations.columns.values) == expected_columns
Пример #4
0
def test_collect_dwd_data_empty():
    """Expect an empty result for a request with no available data."""
    result = collect_climate_observations_data(
        station_id=1048,
        parameter_set=DWDObservationParameterSet.CLIMATE_SUMMARY,
        resolution=DWDObservationResolution.DAILY,
        period=DWDObservationPeriod.RECENT,
    )

    assert result.empty
Пример #5
0
def test_collect_hourly_vanilla():
    """Check the column layout of recent hourly air temperature data (real data)."""
    expected_columns = ["STATION_ID", "DATE", "QN_9", "TT_TU", "RF_TU"]

    observations = collect_climate_observations_data(
        station_id=1048,
        parameter_set=DWDObservationParameterSet.TEMPERATURE_AIR,
        resolution=DWDObservationResolution.HOURLY,
        period=DWDObservationPeriod.RECENT,
    )

    assert list(observations.columns.values) == expected_columns
Пример #6
0
def test_collect_dwd_data_empty():
    """Expect an empty result for a request with no available data."""
    # Scenario 1: request where no data is available. Local data is preferred
    # and nothing is written to file.
    result = collect_climate_observations_data(
        station_ids=[1048],
        parameter=Parameter.CLIMATE_SUMMARY,
        time_resolution=TimeResolution.DAILY,
        period_type=PeriodType.RECENT,
        prefer_local=True,
        write_file=False,
        tidy_data=False,
    )

    assert result.empty
Пример #7
0
def test_collect_hourly_vanilla():
    """Check the column layout of untidied recent hourly air temperature data."""
    expected_columns = ["STATION_ID", "DATE", "QN_9", "TT_TU", "RF_TU"]

    observations = collect_climate_observations_data(
        station_ids=[1048],
        parameter=Parameter.TEMPERATURE_AIR,
        time_resolution=TimeResolution.HOURLY,
        period_type=PeriodType.RECENT,
        tidy_data=False,
    )

    assert list(observations.columns.values) == expected_columns
Пример #8
0
    def collect_data(self) -> Generator[pd.DataFrame, None, None]:
        """
        Collect the data for the defined request, one station at a time.

        The method is built as a generator in order to not clog the memory.
        If the data is wanted as one pandas DataFrame, the generator has to be
        concatenated manually via pd.concat(list(request.collect_data())).

        For every station all configured parameters and period types are
        requested. Combinations that raise InvalidParameterCombination are
        logged and skipped. Within one parameter, rows whose date is already
        present from a previously collected period are dropped. If a start
        date is set, the station data is restricted to the inclusive range
        between start date and end date.

        :return: A generator yielding a pandas.DataFrame per station. Stations
            for which no data remains are skipped.
        """
        date_column = DWDMetaColumns.DATE.value

        for station_id in self.station_ids:
            df_station = pd.DataFrame()

            for parameter in self.parameter:
                df_parameter_period = pd.DataFrame()

                for period_type in self.period_type:
                    try:
                        df_period = collect_climate_observations_data(
                            station_ids=[station_id],
                            parameter=parameter,
                            time_resolution=self.time_resolution,
                            period_type=period_type,
                            folder=self.folder,
                            prefer_local=self.prefer_local,
                            write_file=self.write_file,
                            tidy_data=self.tidy_data,
                            humanize_column_names=self.humanize_column_names,
                        )
                    except InvalidParameterCombination:
                        log.info(
                            f"Combination for "
                            f"{parameter.value}/"
                            f"{self.time_resolution.value}/"
                            f"{period_type} does not exist and is skipped.")

                        continue

                    # Filter out values which already are in the DataFrame
                    try:
                        already_collected = df_period[date_column].isin(
                            df_parameter_period[date_column])
                        df_period = df_period[~already_collected]
                    except KeyError:
                        # One of the frames has no date column, e.g. the
                        # accumulator is still the column-less initial frame.
                        pass

                    # pd.concat replaces DataFrame.append, which was removed
                    # in pandas 2.0.
                    df_parameter_period = pd.concat(
                        [df_parameter_period, df_period], ignore_index=True)

                df_station = pd.concat(
                    [df_station, df_parameter_period], ignore_index=True)

            # Nothing was collected for this station. This has to be checked
            # before the date filter, as a column-less frame would raise a
            # KeyError when its date column is accessed.
            if df_station.empty:
                continue

            # Filter for dates range if start_date and end_date are defined
            if self.start_date:
                in_date_range = (
                    (df_station[date_column] >= self.start_date)
                    & (df_station[date_column] <= self.end_date))
                df_station = df_station[in_date_range]

                # Empty dataframe should be skipped
                if df_station.empty:
                    continue

            yield df_station
Пример #9
0
    def _collect_data(
        self, station_id: int, parameter_set: DWDObservationParameterSet
    ) -> pd.DataFrame:
        """
        Method to collect data for one specified parameter. Manages restoring,
        collection and storing of data, transformation and combination of different
        periods.

        For every configured period the data is first looked up in the storage
        (if one is configured). Only if nothing is restored, the data is
        acquired via collect_climate_observations_data and, if persisting is
        enabled, written to the storage.

        Args:
            station_id: station id for which parameter is collected
            parameter_set: chosen parameter that is collected

        Returns:
            pandas.DataFrame for given parameter of station
        """
        date_column = DWDMetaColumns.DATE.value

        df_parameter = pd.DataFrame()

        for period_type in self.periods:
            parameter_identifier = build_parameter_set_identifier(
                parameter_set, self.resolution, period_type, station_id
            )

            storage = None
            if self.storage:
                storage = self.storage.hdf5(
                    parameter=parameter_set,
                    resolution=self.resolution,
                    period=period_type,
                )

                df_period = storage.restore(station_id)

                # Restored data is taken as is, no acquisition is needed.
                if not df_period.empty:
                    # pd.concat replaces DataFrame.append, which was removed
                    # in pandas 2.0.
                    df_parameter = pd.concat([df_parameter, df_period])
                    continue

            log.info(f"Acquiring observations data for {parameter_identifier}.")

            try:
                df_period = collect_climate_observations_data(
                    station_id, parameter_set, self.resolution, period_type
                )
            except InvalidParameterCombination:
                log.info(
                    f"Invalid combination {parameter_set.value}/"
                    f"{self.resolution.value}/{period_type} is skipped."
                )

                # Continue with an empty frame so the remaining steps still run.
                df_period = pd.DataFrame()

            if self.storage and self.storage.persist:
                storage.store(station_id=station_id, df=df_period)

            # Filter out values which already are in the DataFrame
            try:
                already_collected = df_period[date_column].isin(
                    df_parameter[date_column]
                )
                df_period = df_period[~already_collected]
            except KeyError:
                # One of the frames has no date column, e.g. the accumulator
                # is still the column-less initial frame.
                pass

            df_parameter = pd.concat([df_parameter, df_period])

        if self.tidy_data:
            df_parameter = df_parameter.dwd.tidy_up_data()

            df_parameter.insert(2, DWDMetaColumns.PARAMETER.value, parameter_set.name)

        # Assign meaningful column names (humanized).
        if self.humanize_column_names:
            hcnm = self._create_humanized_column_names_mapping(
                self.resolution, parameter_set
            )

            if self.tidy_data:
                # Tidy data: translate the values of the element column.
                df_parameter[DWDMetaColumns.ELEMENT.value] = df_parameter[
                    DWDMetaColumns.ELEMENT.value
                ].apply(lambda x: hcnm[x])
            else:
                # Otherwise the column labels themselves are renamed.
                df_parameter = df_parameter.rename(columns=hcnm)

        return df_parameter
Пример #10
0
    def _collect_station_parameter(
        self,
        station_id: str,
        parameter: Tuple[Union[DWDObservationParameter,
                               DWDObservationParameterSet],
                         DWDObservationParameterSet, ],
    ) -> pd.DataFrame:
        """
        Method to collect data for one specified parameter. Manages restoring,
        collection and storing of data, transformation and combination of different
        periods.

        Args:
            station_id: station id for which parameter is collected
            parameter: chosen parameter-parameter_set combination that is collected

        Returns:
            pandas.DataFrame for given parameter of station
        """
        parameter, parameter_set = parameter
        date_column = DWDMetaColumns.DATE.value

        # Build the list of requests. For high resolutions the historical
        # period is requested once per date range, all other periods are
        # requested once without a date range.
        periods_and_date_ranges = []
        for period in self.periods:
            if (self.resolution in HIGH_RESOLUTIONS
                    and period == DWDObservationPeriod.HISTORICAL):
                date_ranges = self._get_historical_date_ranges(
                    station_id, parameter_set)

                periods_and_date_ranges.extend(
                    (period, date_range) for date_range in date_ranges)
            else:
                periods_and_date_ranges.append((period, None))

        parameter_df = pd.DataFrame()

        for period, date_range in periods_and_date_ranges:
            parameter_identifier = build_parameter_set_identifier(
                parameter_set, self.resolution, period, station_id, date_range)

            log.info(
                f"Acquiring observations data for {parameter_identifier}.")

            # TODO: integrate collect_climate_observations_data in class
            try:
                period_df = collect_climate_observations_data(
                    station_id, parameter_set, self.resolution, period,
                    date_range)
            except InvalidParameterCombination:
                log.info(f"Invalid combination {parameter_set.value}/"
                         f"{self.resolution.value}/{period} is skipped.")

                # Continue with an empty frame so the remaining steps still run.
                period_df = pd.DataFrame()

            # Filter out values which already are in the DataFrame
            try:
                already_collected = period_df[date_column].isin(
                    parameter_df[date_column])
                period_df = period_df[~already_collected]
            except KeyError:
                # One of the frames has no date column, e.g. the accumulator
                # is still the column-less initial frame.
                pass

            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0.
            parameter_df = pd.concat([parameter_df, period_df])

        if self.tidy_data:
            parameter_df = parameter_df.dwd.tidy_up_data()

            # TODO: remove this column and rather move it into metadata of resulting
            #  data model
            parameter_df.insert(2, DWDMetaColumns.PARAMETER_SET.value,
                                parameter_set.name)
            parameter_df[DWDMetaColumns.PARAMETER_SET.value] = parameter_df[
                DWDMetaColumns.PARAMETER_SET.value].astype("category")

        # A single parameter was requested rather than a whole parameter set:
        # keep only the rows belonging to that parameter.
        if parameter not in DWDObservationParameterSet:
            parameter_df = parameter_df[parameter_df[
                DWDMetaColumns.PARAMETER.value] == parameter.value]

        return parameter_df