Example #1
def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    # Create a DataLakeProvider with remove_status_codes passed as a keyword argument
    data_provider = DataLakeProvider(interactive=False,
                                     remove_status_codes=remove_status_codes)

    # Replace the data_provider's client with AzureDLFileSystemMock; since interactive is False, no real authentication is triggered.
    data_provider.client = AzureDLFileSystemMock()
    # Get the ncs_reader from data_provider.
    ncs_reader = data_provider._get_sub_dataproviders()[0]

    # Check that the remove_status_codes kwarg has been passed through to the sub-provider
    expected = [] if remove_status_codes == [] else [0]
    assert ncs_reader.remove_status_codes == expected
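A minimal sketch of how the test above could be parametrized; the concrete values are assumptions chosen to exercise both branches of the ``expected`` expression, not taken from the original test module:

import pytest

# Hypothetical parametrization: an empty list should be passed through unchanged,
# while a non-empty list such as [0] should leave the sub-provider holding [0].
@pytest.mark.parametrize("remove_status_codes", [[], [0]])
def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    ...  # body as shown above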
Example #2
def test_faked_DataLakeBackedDataset(MockDataset):

    provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
    dataset = TimeSeriesDataset(data_provider=provider, **CONFIG)

    # Should be able to call get_data without being asked to authenticate in tests
    X, y = dataset.get_data()
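``MockDataset`` and ``CONFIG`` are fixtures/constants defined elsewhere in the test module; a minimal sketch of a config dict that would satisfy the ``TimeSeriesDataset`` signature shown in the later examples (dates and tag names are reused from the other tests here, not the actual CONFIG):

from gordo.machine.dataset.sensor_tag import normalize_sensor_tags

# Hypothetical stand-in for the CONFIG constant used above.
CONFIG = {
    "train_start_date": "2017-01-01T08:56:00+00:00",
    "train_end_date": "2017-01-01T10:01:00+00:00",
    "tag_list": normalize_sensor_tags(
        ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
    ),
}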
def test_get_data_serviceauth_fail(caplog):
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")

    dataset_config = _get_default_dataset_config()
    dataset_config["train_start_date"] = train_start_date
    dataset_config["train_end_date"] = train_end_date
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD")

    dl_backed = dataset._get_dataset(dataset_config)

    with pytest.raises(adal.adal_error.AdalError), caplog.at_level(
            logging.CRITICAL):
        dl_backed.get_data()
def _get_default_dataset_config():
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": normalize_sensor_tags(
            ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
        ),
        "data_provider": DataLakeProvider(),
    }
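``normalize_sensor_tags`` converts plain tag strings into ``SensorTag`` namedtuples; a rough illustration of the shape of the result, assuming the ``(name, asset)`` namedtuple from ``gordo.machine.dataset.sensor_tag`` (the resolved asset values are omitted, as they depend on the tag-to-asset mapping):

from gordo.machine.dataset.sensor_tag import normalize_sensor_tags

tags = normalize_sensor_tags(["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"])
# Roughly a list of namedtuples:
#   [SensorTag(name="TRC-FIQ -39-0706", asset=...),
#    SensorTag(name="GRA-EM-23-0003ARV.PV", asset=...)]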
def test_get_data_serviceauth_in_config():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH"))
    dataset_config["resolution"] = "10T"
    dl_backed = dataset._get_dataset(dataset_config)
    data, _ = dl_backed.get_data()

    assert dataset_config["tag_list"] == list(data.columns.values)

    expected_rows = 7
    assert (
        len(data) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"

    assert (not data.isnull().values.any()
            ), "Resulting dataframe should not have any NaNs"
Example #6
    def __init__(
        self,
        train_start_date: Union[datetime, str],
        train_end_date: Union[datetime, str],
        tag_list: Sequence[Union[str, Dict, SensorTag]],
        target_tag_list: Optional[Sequence[Union[str, Dict, SensorTag]]] = None,
        data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
        resolution: Optional[str] = "10T",
        row_filter: Union[str, list] = "",
        known_filter_periods: Optional[list] = [],
        aggregation_methods: Union[str, List[str], Callable] = "mean",
        row_filter_buffer_size: int = 0,
        asset: Optional[str] = None,
        default_asset: Optional[str] = None,
        n_samples_threshold: int = 0,
        low_threshold: Optional[int] = -1000,
        high_threshold: Optional[int] = 50000,
        interpolation_method: str = "linear_interpolation",
        interpolation_limit: str = "8H",
        filter_periods: Optional[dict] = {},
        tag_normalizer: Union[str, Callable[..., List[SensorTag]]] = "default",
    ):
        """
        Creates a TimeSeriesDataset backed by a provided dataprovider.

        A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
        aligned, and (optionally) filtered.

        Parameters
        ----------
        train_start_date: Union[datetime, str]
            Earliest possible point in the dataset (inclusive)
        train_end_date: Union[datetime, str]
            Latest possible point in the dataset (exclusive)
        tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
            List of tags to include in the dataset. The elements can be strings,
            dictionaries or SensorTag namedtuples.
        target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
            List of tags to set as the dataset y. These will be treated the same as
            tag_list when fetching and pre-processing (resampling) but will be split
            into the y return from ``.get_data()``
        data_provider: Union[GordoBaseDataProvider, dict]
            A dataprovider which can provide dataframes for tags from train_start_date to train_end_date
            of which can also be a config definition from a data provider's ``.to_dict()`` method.
        resolution: Optional[str]
            The bucket size for grouping all incoming time data (e.g. "10T").
            Available strings come from https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
            **Note**: If this parameter is ``None`` or ``False``, then _no_ aggregation/resampling is applied to the data.
        row_filter: str or list
            Filter on the rows. Only rows satisfying the filter will be in the dataset.
            See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows` for
            further documentation of the filter format.
        known_filter_periods: list
            List of periods to drop in the format [~('2020-04-08 04:00:00+00:00' < index < '2020-04-08 10:00:00+00:00')].
            Note the time-zone suffix (+00:00), which is required.
        aggregation_methods
            Aggregation method(s) to use for the resampled buckets. If a single
            resample method is provided then the resulting dataframe will have names
            identical to the names of the series it got in. If several
            aggregation-methods are provided then the resulting dataframe will
            have a multi-level column index, with the series-name as the first level,
            and the aggregation method as the second level.
            See :py:func::`pandas.core.resample.Resampler#aggregate` for more
            information on possible aggregation methods.
        row_filter_buffer_size: int
            Elements selected for removal based on the ``row_filter`` will also
            have this many neighboring elements removed before and after them.
            Default is 0.
        asset: Optional[str]
            Asset with which the tags are associated.
        default_asset: Optional[str]
            Asset which will be used if `asset` is not provided and the tag is not
            resolvable to a specific asset.
        n_samples_threshold: int
            Row-count threshold below which the resulting DataFrame is considered to have too few rows of data.
        interpolation_method: str
            How missing values should be interpolated: either forward fill (`ffill`)
            or linear interpolation (default, `linear_interpolation`).
        interpolation_limit: str
            Maximum distance from the last valid data point over which values will be interpolated/forward filled.
            Default is eight hours (`8H`).
            If None, all missing values are interpolated/forward filled.
        filter_periods: dict
            If specified, applies a series of algorithms that drop noisy data.
            See the ``FilterPeriods`` class for details.
        tag_normalizer: Union[str, Callable[..., List[SensorTag]]]
            `default` is currently the only supported value; it uses
            ``gordo.machine.dataset.sensor_tag.normalize_sensor_tags``.

        """
        self.train_start_date = self._validate_dt(train_start_date)
        self.train_end_date = self._validate_dt(train_end_date)

        if self.train_start_date >= self.train_end_date:
            raise ValueError(
                f"train_end_date ({self.train_end_date}) must be after train_start_date ({self.train_start_date})"
            )

        if isinstance(tag_normalizer, str):
            if tag_normalizer not in self.TAG_NORMALIZERS:
                raise ValueError(
                    "Unsupported tag_normalizer type '%s'" % tag_normalizer
                )
            tag_normalizer = self.TAG_NORMALIZERS[tag_normalizer]
        self.tag_normalizer = tag_normalizer

        self.asset = asset
        self.default_asset = default_asset

        self.tag_list = self.tag_normalizer(list(tag_list), asset, default_asset)
        self.target_tag_list = (
            self.tag_normalizer(list(target_tag_list), asset, default_asset)
            if target_tag_list
            else self.tag_list.copy()
        )
        self.resolution = resolution
        self.data_provider = (
            data_provider
            if not isinstance(data_provider, dict)
            else GordoBaseDataProvider.from_dict(data_provider)
        )
        self.row_filter = row_filter
        self.aggregation_methods = aggregation_methods
        self.row_filter_buffer_size = row_filter_buffer_size
        self.n_samples_threshold = n_samples_threshold
        self.low_threshold = low_threshold
        self.high_threshold = high_threshold
        self.interpolation_method = interpolation_method
        self.interpolation_limit = interpolation_limit
        self.filter_periods = (
            FilterPeriods(granularity=self.resolution, **filter_periods)
            if filter_periods
            else None
        )
        self.known_filter_periods = known_filter_periods

        if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
            raise ValueError(
                f"Timestamps ({self.train_start_date}, {self.train_end_date}) need to include timezone "
                f"information"
            )

        super().__init__()
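A minimal usage sketch of the constructor documented above; the dates and tag names are reused from the tests earlier in this section, and the import paths are assumptions based on the module names referenced in the docstring:

# Assumed import paths; adjust to wherever TimeSeriesDataset and DataLakeProvider live.
from gordo.machine.dataset.datasets import TimeSeriesDataset
from gordo.machine.dataset.data_provider.providers import DataLakeProvider

dataset = TimeSeriesDataset(
    train_start_date="2017-01-01T08:56:00+00:00",
    train_end_date="2017-01-01T10:01:00+00:00",
    tag_list=["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"],
    data_provider=DataLakeProvider(interactive=True),
    resolution="10T",          # resample into 10-minute buckets
    aggregation_methods="mean",
    interpolation_limit="8H",  # stop filling gaps longer than eight hours
)
X, y = dataset.get_data()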
Example #7
    def __init__(
        self,
        train_start_date: Union[datetime, str],
        train_end_date: Union[datetime, str],
        tag_list: Sequence[Union[str, Dict, SensorTag]],
        target_tag_list: Optional[Sequence[Union[str, Dict,
                                                 SensorTag]]] = None,
        data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
        resolution: Optional[str] = "10T",
        row_filter: str = "",
        aggregation_methods: Union[str, List[str], Callable] = "mean",
        row_filter_buffer_size: int = 0,
        asset: Optional[str] = None,
        default_asset: Optional[str] = None,
        n_samples_threshold: int = 0,
        **_kwargs,
    ):
        """
        Creates a TimeSeriesDataset backed by a provided dataprovider.

        A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
        aligned, and (optionally) filtered.

        Parameters
        ----------
        train_start_date: Union[datetime, str]
            Earliest possible point in the dataset (inclusive)
        train_end_date: Union[datetime, str]
            Latest possible point in the dataset (exclusive)
        tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
            List of tags to include in the dataset. The elements can be strings,
            dictionaries or SensorTag namedtuples.
        target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
            List of tags to set as the dataset y. These will be treated the same as
            tag_list when fetching and pre-processing (resampling) but will be split
            into the y return from ``.get_data()``
        data_provider: Union[GordoBaseDataProvider, dict]
            A dataprovider which can provide dataframes for tags from train_start_date to train_end_date
            of which can also be a config definition from a data provider's ``.to_dict()`` method.
        resolution: Optional[str]
            The bucket size for grouping all incoming time data (e.g. "10T").
            Available strings come from https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
            **Note**: If this parameter is ``None`` or ``False``, then _no_ aggregation/resampling is applied to the data.
        row_filter: str
            Filter on the rows. Only rows satisfying the filter will be in the dataset.
            See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows` for
            further documentation of the filter format.
        aggregation_methods
            Aggregation method(s) to use for the resampled buckets. If a single
            resample method is provided then the resulting dataframe will have names
            identical to the names of the series it got in. If several
            aggregation-methods are provided then the resulting dataframe will
            have a multi-level column index, with the series-name as the first level,
            and the aggregation method as the second level.
            See :py:func::`pandas.core.resample.Resampler#aggregate` for more
            information on possible aggregation methods.
        row_filter_buffer_size: int
            Elements selected for removal based on the ``row_filter`` will also
            have this many neighboring elements removed before and after them.
            Default is 0.
        asset: Optional[str]
            Asset with which the tags are associated.
        default_asset: Optional[str]
            Asset which will be used if `asset` is not provided and the tag is not
            resolvable to a specific asset.
        n_samples_threshold: int
            Row-count threshold below which the resulting DataFrame is considered to have too few rows of data.
        _kwargs
            Additional keyword arguments are accepted but not used.
        """
        self.train_start_date = self._validate_dt(train_start_date)
        self.train_end_date = self._validate_dt(train_end_date)

        if self.train_start_date >= self.train_end_date:
            raise ValueError(
                f"train_end_date ({self.train_end_date}) must be after train_start_date ({self.train_start_date})"
            )

        self.tag_list = normalize_sensor_tags(list(tag_list), asset,
                                              default_asset)
        self.target_tag_list = (normalize_sensor_tags(list(target_tag_list),
                                                      asset, default_asset)
                                if target_tag_list else self.tag_list.copy())
        self.resolution = resolution
        self.data_provider = (data_provider
                              if not isinstance(data_provider, dict) else
                              GordoBaseDataProvider.from_dict(data_provider))
        self.row_filter = row_filter
        self.aggregation_methods = aggregation_methods
        self.row_filter_buffer_size = row_filter_buffer_size
        self.asset = asset
        self.n_samples_threshold = n_samples_threshold

        if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
            raise ValueError(
                f"Timestamps ({self.train_start_date}, {self.train_end_date}) need to include timezone "
                f"information")
def test_get_data_interactive():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(interactive=True)
    dl_backed = dataset._get_dataset(dataset_config)
    data = dl_backed.get_data()
    assert len(data) >= 0