Example #1
def test_filter_periods_median(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="median", n_iqr=1).filter_data(data)

    assert data.shape == (9063, 1)
    assert data["Tag 1"].mean() == 0.5113691034704841

    assert sum(predictions["median"]["pred"]) == -493
    assert len(drop_periods["median"]) == 44
    assert data_filtered.shape == (8570, 1)
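All of the examples on this page share the same call shape: ``filter_data`` takes a single-column DataFrame and returns the filtered frame, the dropped periods per method, and per-method prediction frames. A minimal sketch of that flow on synthetic data follows; the import path is an assumption inferred from the ``gordo.machine.dataset.*`` references in Example #7, since the snippets here omit their imports.

import numpy as np
import pandas as pd

# Import path assumed; the examples on this page omit their imports.
from gordo.machine.dataset.filter_periods import FilterPeriods

# Synthetic 10-minute series with one injected anomalous stretch.
index = pd.date_range("2020-01-01", periods=1000, freq="10T", tz="UTC")
values = np.random.default_rng(0).normal(0.5, 0.05, len(index))
values[100:110] = 5.0  # obvious outlier period
data = pd.DataFrame({"Tag 1": values}, index=index)

data_filtered, drop_periods, predictions = FilterPeriods(
    granularity="10T", filter_method="median", n_iqr=1
).filter_data(data)

# Rows flagged as outliers by the median filter are removed, so the
# filtered frame is never longer than the input.
assert len(data_filtered) <= len(data)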
Example #2
def test_filter_periods_iforest_smoothing(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="iforest",
        iforest_smooth=True).filter_data(data)

    assert data.shape == (9674, 1)
    assert data["Tag 1"].mean() == 0.5019862352609169

    assert sum(predictions["iforest"]["pred"]) == 8552
    assert len(drop_periods["iforest"]) == 41
    assert data_filtered.shape == (9113, 1)
Example #3
def test_filter_periods_iforest(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="iforest",
        iforest_smooth=False).filter_data(data)

    assert data.shape == (12838, 1)
    assert data["Tag 1"].mean() == 0.5144733352386245

    assert sum(predictions["iforest"]["pred"]) == 12066
    assert len(drop_periods["iforest"]) == 61
    assert data_filtered.shape == (12452, 1)
Example #4
def test_filter_periods_all_smoothing(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="all", n_iqr=1,
        iforest_smooth=True).filter_data(data)

    assert data.shape == (8595, 1)
    assert data["Tag 1"].mean() == 0.512856120233814

    assert sum(predictions["iforest"]["pred"]) == 7471
    assert len(drop_periods["median"]) == 39
    assert len(drop_periods["iforest"]) == 29
    assert data_filtered.shape == (7522, 1)
Example #5
def test_filter_periods_all(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="all", n_iqr=1,
        iforest_smooth=False).filter_data(data)

    assert data.shape == (8024, 1)
    assert data["Tag 1"].mean() == 0.500105748646813

    assert sum(predictions["median"]["pred"]) == -449
    assert sum(predictions["iforest"]["pred"]) == 7542
    assert len(drop_periods["median"]) == 39
    assert len(drop_periods["iforest"]) == 29
    assert data_filtered.shape == (7356, 1)
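A detail worth noting across Examples #1-#5: the asserted sums are consistent with the median filter emitting 0/-1 labels (in Example #1, sum == -493 matches the 493 rows dropped, 9063 - 8570) and the isolation forest emitting sklearn-style +1/-1 labels (in Example #2, 561 dropped rows give 9674 - 2*561 = 8552). Under that assumption, a sketch of counting flagged rows per method after a filter_method="all" run:

# `data` as in the sketch after Example #1.
# Assumption: pred == -1 marks a flagged (outlier) row for both methods.
data_filtered, drop_periods, predictions = FilterPeriods(
    granularity="10T", filter_method="all", n_iqr=1, iforest_smooth=False
).filter_data(data)

for method in ("median", "iforest"):
    flagged = (predictions[method]["pred"] == -1).sum()
    print(f"{method}: {flagged} flagged rows, "
          f"{len(drop_periods[method])} dropped periods")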
Example #6
def test_filter_periods_typerror(dataset):
    data, _ = dataset.get_data()
    assert data.shape == (9760, 1)
    with pytest.raises(TypeError):
        FilterPeriods(granularity="10T", filter_method="abc", n_iqr=1)
Example #7
    def __init__(
        self,
        train_start_date: Union[datetime, str],
        train_end_date: Union[datetime, str],
        tag_list: Sequence[Union[str, Dict, SensorTag]],
        target_tag_list: Optional[Sequence[Union[str, Dict, SensorTag]]] = None,
        data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
        resolution: Optional[str] = "10T",
        row_filter: Union[str, list] = "",
        known_filter_periods: Optional[list] = [],
        aggregation_methods: Union[str, List[str], Callable] = "mean",
        row_filter_buffer_size: int = 0,
        asset: Optional[str] = None,
        default_asset: Optional[str] = None,
        n_samples_threshold: int = 0,
        low_threshold: Optional[int] = -1000,
        high_threshold: Optional[int] = 50000,
        interpolation_method: str = "linear_interpolation",
        interpolation_limit: str = "8H",
        filter_periods: Optional[dict] = {},
        tag_normalizer: Union[str, Callable[..., List[SensorTag]]] = "default",
    ):
        """
        Creates a TimeSeriesDataset backed by a provided data provider.

        A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
        aligned, and (optionally) filtered.

        Parameters
        ----------
        train_start_date: Union[datetime, str]
            Earliest possible point in the dataset (inclusive)
        train_end_date: Union[datetime, str]
            Latest possible point in the dataset (exclusive)
        tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
            List of tags to include in the dataset. The elements can be strings,
            dictionaries or SensorTag namedtuples.
        target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
            List of tags to set as the dataset y. These will be treated the same as
            tag_list when fetching and pre-processing (resampling) but will be split
            into the y return from ``.get_data()``
        data_provider: Union[GordoBaseDataProvider, dict]
            A data provider which can fetch dataframes for the tags from
            train_start_date to train_end_date. May also be a config dict
            produced by a data provider's ``.to_dict()`` method.
        resolution: Optional[str]
            The bucket size for grouping all incoming time data (e.g. "10T").
            Available strings come from https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
            **Note**: If this parameter is ``None`` or ``False``, then _no_ aggregation/resampling is applied to the data.
        row_filter: Union[str, list]
            Filter on the rows. Only rows satisfying the filter will be in the dataset.
            See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows` for
            further documentation of the filter format.
        known_filter_periods: list
            List of periods to drop in the format [~('2020-04-08 04:00:00+00:00' < index < '2020-04-08 10:00:00+00:00')].
            Note the time-zone suffix (+00:00), which is required.
        aggregation_methods: Union[str, List[str], Callable]
            Aggregation method(s) to use for the resampled buckets. If a single
            aggregation method is provided, the resulting dataframe keeps the
            names of the input series. If several aggregation methods are
            provided, the resulting dataframe gets a multi-level column index,
            with the series name as the first level and the aggregation method
            as the second level.
            See :py:func:`pandas.core.resample.Resampler.aggregate` for more
            information on possible aggregation methods.
        row_filter_buffer_size: int
            Any element selected for removal by the ``row_filter`` will also have
            this number of elements removed before and after it.
            Default is 0.
        asset: Optional[str]
            Asset with which the tags are associated.
        default_asset: Optional[str]
            Asset which will be used if `asset` is not provided and the tag is not
            resolvable to a specific asset.
        n_samples_threshold: int
            The threshold below which the generated DataFrame is considered to have
            too few rows of data.
        low_threshold: Optional[int]
            Lower threshold for filtering out extreme values in the data.
        high_threshold: Optional[int]
            Upper threshold for filtering out extreme values in the data.
        interpolation_method: str
            How missing values should be interpolated: either forward fill
            (`ffill`) or linear interpolation (the default, `linear_interpolation`).
        interpolation_limit: str
            How far from the last valid data point values will be interpolated
            or forward filled. Default is eight hours (`8H`).
            If None, all missing values are interpolated/forward filled.
        filter_periods: dict
            If specified, runs a series of filtering algorithms that drop noisy
            data. See the `FilterPeriods` class for details.
        tag_normalizer: Union[str, Callable[..., List[SensorTag]]]
            `default` is the only supported value for now; in that case
            ``gordo.machine.dataset.sensor_tag.normalize_sensor_tags`` is used.

        """
        self.train_start_date = self._validate_dt(train_start_date)
        self.train_end_date = self._validate_dt(train_end_date)

        if self.train_start_date >= self.train_end_date:
            raise ValueError(
                f"train_end_date ({self.train_end_date}) must be after train_start_date ({self.train_start_date})"
            )

        if isinstance(tag_normalizer, str):
            if tag_normalizer not in self.TAG_NORMALIZERS:
                raise ValueError(
                    "Unsupported tag_normalizer type '%s'" % tag_normalizer
                )
            tag_normalizer = self.TAG_NORMALIZERS[tag_normalizer]
        self.tag_normalizer = tag_normalizer

        self.asset = asset
        self.default_asset = default_asset

        self.tag_list = self.tag_normalizer(list(tag_list), asset, default_asset)
        self.target_tag_list = (
            self.tag_normalizer(list(target_tag_list), asset, default_asset)
            if target_tag_list
            else self.tag_list.copy()
        )
        self.resolution = resolution
        self.data_provider = (
            data_provider
            if not isinstance(data_provider, dict)
            else GordoBaseDataProvider.from_dict(data_provider)
        )
        self.row_filter = row_filter
        self.aggregation_methods = aggregation_methods
        self.row_filter_buffer_size = row_filter_buffer_size
        self.n_samples_threshold = n_samples_threshold
        self.low_threshold = low_threshold
        self.high_threshold = high_threshold
        self.interpolation_method = interpolation_method
        self.interpolation_limit = interpolation_limit
        self.filter_periods = (
            FilterPeriods(granularity=self.resolution, **filter_periods)
            if filter_periods
            else None
        )
        self.known_filter_periods = known_filter_periods

        if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
            raise ValueError(
                f"Timestamps ({self.train_start_date}, {self.train_end_date}) need to include timezone "
                f"information"
            )

        super().__init__()
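To tie the constructor parameters together, a hedged construction sketch follows. The import path, tag names, and dates are placeholders (the class's module is not shown on this page), and calling ``get_data()`` needs a reachable data provider, since the default ``DataLakeProvider()`` fetches from a data lake.

from gordo.machine.dataset.datasets import TimeSeriesDataset  # path assumed

dataset = TimeSeriesDataset(
    train_start_date="2020-01-01 00:00:00+00:00",  # timezone suffix is required
    train_end_date="2020-02-01 00:00:00+00:00",
    tag_list=["Tag 1", "Tag 2"],                   # placeholder tag names
    resolution="10T",
    row_filter="(`Tag 1` > 0) & (`Tag 2` > 0)",    # see pandas_filter_rows
    row_filter_buffer_size=3,                      # also drop 3 rows before/after
    known_filter_periods=[
        "~('2020-01-08 04:00:00+00:00' < index < '2020-01-08 10:00:00+00:00')"
    ],
    filter_periods={"filter_method": "median", "n_iqr": 1},
    interpolation_method="ffill",
    interpolation_limit="8H",
)
X, y = dataset.get_data()  # y mirrors X here: target_tag_list defaults to tag_list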