def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    # Creates a DataLakeProvider with remove_status_codes as kwargs
    data_provider = DataLakeProvider(
        interactive=False, remove_status_codes=remove_status_codes
    )
    # Set the data_provider's client to the AzureDLFileSystemMock, since interactive is False.
    data_provider.client = AzureDLFileSystemMock()
    # Get the ncs_reader from data_provider.
    ncs_reader = data_provider._get_sub_dataproviders()[0]
    # Checks that the remove_status_codes kwarg has been passed on to the sub-provider
    expected = [] if remove_status_codes == [] else [0]
    assert ncs_reader.remove_status_codes == expected
def test_faked_DataLakeBackedDataset(MockDataset):
    provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
    dataset = TimeSeriesDataset(data_provider=provider, **CONFIG)

    # Should be able to call get_data without being asked to authenticate in tests
    X, y = dataset.get_data()
def test_get_data_serviceauth_fail(caplog):
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    dataset_config = _get_default_dataset_config()
    dataset_config["train_start_date"] = train_start_date
    dataset_config["train_end_date"] = train_end_date
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD"
    )
    dl_backed = dataset._get_dataset(dataset_config)

    with pytest.raises(adal.adal_error.AdalError), caplog.at_level(logging.CRITICAL):
        dl_backed.get_data()
def _get_default_dataset_config():
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": normalize_sensor_tags(
            ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
        ),
        "data_provider": DataLakeProvider(),
    }
def test_get_data_serviceauth_in_config():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH")
    )
    dataset_config["resolution"] = "10T"
    dl_backed = dataset._get_dataset(dataset_config)
    data, _ = dl_backed.get_data()

    assert dataset_config["tag_list"] == list(data.columns.values)

    expected_rows = 7
    assert (
        len(data) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"

    assert (
        not data.isnull().values.any()
    ), "Resulting dataframe should not have any NaNs"
def __init__(
    self,
    train_start_date: Union[datetime, str],
    train_end_date: Union[datetime, str],
    tag_list: Sequence[Union[str, Dict, SensorTag]],
    target_tag_list: Optional[Sequence[Union[str, Dict, SensorTag]]] = None,
    data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
    resolution: Optional[str] = "10T",
    row_filter: Union[str, list] = "",
    known_filter_periods: Optional[list] = [],
    aggregation_methods: Union[str, List[str], Callable] = "mean",
    row_filter_buffer_size: int = 0,
    asset: Optional[str] = None,
    default_asset: Optional[str] = None,
    n_samples_threshold: int = 0,
    low_threshold: Optional[int] = -1000,
    high_threshold: Optional[int] = 50000,
    interpolation_method: str = "linear_interpolation",
    interpolation_limit: str = "8H",
    filter_periods: Optional[dict] = {},
    tag_normalizer: Union[str, Callable[..., List[SensorTag]]] = "default",
):
    """
    Creates a TimeSeriesDataset backed by a provided dataprovider.

    A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
    aligned, and (optionally) filtered.

    Parameters
    ----------
    train_start_date: Union[datetime, str]
        Earliest possible point in the dataset (inclusive)
    train_end_date: Union[datetime, str]
        Latest possible point in the dataset (exclusive)
    tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
        List of tags to include in the dataset. The elements can be strings,
        dictionaries or SensorTag namedtuples.
    target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
        List of tags to set as the dataset y. These will be treated the same
        as tag_list when fetching and pre-processing (resampling) but will be
        split into the y return from ``.get_data()``
    data_provider: Union[GordoBaseDataProvider, dict]
        A dataprovider which can provide dataframes for tags from
        train_start_date to train_end_date, or a config definition from a
        data provider's ``.to_dict()`` method.
    resolution: Optional[str]
        The bucket size for grouping all incoming time data (e.g. "10T").
        Available strings come from
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        **Note**: If this parameter is ``None`` or ``False``, then _no_
        aggregation/resampling is applied to the data.
    row_filter: Union[str, list]
        Filter on the rows. Only rows satisfying the filter will be in the
        dataset. See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows`
        for further documentation of the filter format.
    known_filter_periods: Optional[list]
        List of periods to drop, in the format
        [~('2020-04-08 04:00:00+00:00' < index < '2020-04-08 10:00:00+00:00')].
        Note the time-zone suffix (+00:00), which is required.
    aggregation_methods
        Aggregation method(s) to use for the resampled buckets. If a single
        resample method is provided, the resulting dataframe will have names
        identical to the names of the series it got in. If several
        aggregation methods are provided, the resulting dataframe will have a
        multi-level column index, with the series name as the first level and
        the aggregation method as the second level. See
        :py:func:`pandas.core.resample.Resampler.aggregate` for more
        information on possible aggregation methods.
    row_filter_buffer_size: int
        Whatever elements are selected for removal based on the
        ``row_filter`` will also have this number of elements removed fore
        and aft. Default is 0.
    asset: Optional[str]
        Asset with which the tags are associated.
    default_asset: Optional[str]
        Asset which will be used if `asset` is not provided and the tag is
        not resolvable to a specific asset.
    n_samples_threshold: int
        The threshold at which the generated DataFrame is considered to have
        too few rows of data. Default is 0.
    interpolation_method: str
        How missing values should be interpolated: either forward fill
        (`ffill`) or linear interpolation (default, `linear_interpolation`).
    interpolation_limit: str
        Sets how long after the last valid data point values will be
        interpolated/forward filled. Default is eight hours (`8H`). If None,
        all missing values are interpolated/forward filled.
    filter_periods: Optional[dict]
        Runs a series of algorithms that drop noisy data, if specified.
        See the `FilterPeriods` class for details.
    tag_normalizer: Union[str, Callable[..., List[SensorTag]]]
        `default` is the only supported value for now; in that case
        ``gordo.machine.dataset.sensor_tag.normalize_sensor_tags`` is used.
    """
    self.train_start_date = self._validate_dt(train_start_date)
    self.train_end_date = self._validate_dt(train_end_date)

    if self.train_start_date >= self.train_end_date:
        raise ValueError(
            f"train_end_date ({self.train_end_date}) must be after "
            f"train_start_date ({self.train_start_date})"
        )

    if isinstance(tag_normalizer, str):
        if tag_normalizer not in self.TAG_NORMALIZERS:
            raise ValueError(
                "Unsupported tag_normalizer type '%s'" % tag_normalizer
            )
        tag_normalizer = self.TAG_NORMALIZERS[tag_normalizer]
    self.tag_normalizer = tag_normalizer

    self.asset = asset
    self.default_asset = default_asset

    self.tag_list = self.tag_normalizer(list(tag_list), asset, default_asset)
    self.target_tag_list = (
        self.tag_normalizer(list(target_tag_list), asset, default_asset)
        if target_tag_list
        else self.tag_list.copy()
    )
    self.resolution = resolution
    self.data_provider = (
        data_provider
        if not isinstance(data_provider, dict)
        else GordoBaseDataProvider.from_dict(data_provider)
    )
    self.row_filter = row_filter
    self.aggregation_methods = aggregation_methods
    self.row_filter_buffer_size = row_filter_buffer_size
    self.n_samples_threshold = n_samples_threshold
    self.low_threshold = low_threshold
    self.high_threshold = high_threshold
    self.interpolation_method = interpolation_method
    self.interpolation_limit = interpolation_limit
    self.filter_periods = (
        FilterPeriods(granularity=self.resolution, **filter_periods)
        if filter_periods
        else None
    )
    self.known_filter_periods = known_filter_periods

    if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
        raise ValueError(
            f"Timestamps ({self.train_start_date}, {self.train_end_date}) "
            f"need to include timezone information"
        )

    super().__init__()
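For illustration, here is a minimal usage sketch of this constructor. It reuses the tag names, window, and interactive provider that appear in the tests above; the remaining parameter choices are illustrative, and calling ``get_data()`` outside the mocked tests would trigger interactive Azure authentication.

# Minimal usage sketch (parameter choices beyond the test fixtures are illustrative).
# Builds a TimeSeriesDataset over a fixed window at 10-minute resolution;
# X holds the resampled tag_list columns and y the target_tag_list columns.
import dateutil.parser

dataset = TimeSeriesDataset(
    train_start_date=dateutil.parser.isoparse("2017-01-01T08:56:00+00:00"),
    train_end_date=dateutil.parser.isoparse("2017-01-01T10:01:00+00:00"),
    tag_list=["TRC-FIQ -39-0706"],
    target_tag_list=["GRA-EM-23-0003ARV.PV"],
    data_provider=DataLakeProvider(interactive=True),
    resolution="10T",
    interpolation_method="ffill",  # forward fill instead of the default linear interpolation
)
X, y = dataset.get_data()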
def __init__(
    self,
    train_start_date: Union[datetime, str],
    train_end_date: Union[datetime, str],
    tag_list: Sequence[Union[str, Dict, SensorTag]],
    target_tag_list: Optional[Sequence[Union[str, Dict, SensorTag]]] = None,
    data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
    resolution: Optional[str] = "10T",
    row_filter: str = "",
    aggregation_methods: Union[str, List[str], Callable] = "mean",
    row_filter_buffer_size: int = 0,
    asset: Optional[str] = None,
    default_asset: Optional[str] = None,
    n_samples_threshold: int = 0,
    **_kwargs,
):
    """
    Creates a TimeSeriesDataset backed by a provided dataprovider.

    A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
    aligned, and (optionally) filtered.

    Parameters
    ----------
    train_start_date: Union[datetime, str]
        Earliest possible point in the dataset (inclusive)
    train_end_date: Union[datetime, str]
        Latest possible point in the dataset (exclusive)
    tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
        List of tags to include in the dataset. The elements can be strings,
        dictionaries or SensorTag namedtuples.
    target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
        List of tags to set as the dataset y. These will be treated the same
        as tag_list when fetching and pre-processing (resampling) but will be
        split into the y return from ``.get_data()``
    data_provider: Union[GordoBaseDataProvider, dict]
        A dataprovider which can provide dataframes for tags from
        train_start_date to train_end_date, or a config definition from a
        data provider's ``.to_dict()`` method.
    resolution: Optional[str]
        The bucket size for grouping all incoming time data (e.g. "10T").
        Available strings come from
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        **Note**: If this parameter is ``None`` or ``False``, then _no_
        aggregation/resampling is applied to the data.
    row_filter: str
        Filter on the rows. Only rows satisfying the filter will be in the
        dataset. See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows`
        for further documentation of the filter format.
    aggregation_methods
        Aggregation method(s) to use for the resampled buckets. If a single
        resample method is provided, the resulting dataframe will have names
        identical to the names of the series it got in. If several
        aggregation methods are provided, the resulting dataframe will have a
        multi-level column index, with the series name as the first level and
        the aggregation method as the second level. See
        :py:func:`pandas.core.resample.Resampler.aggregate` for more
        information on possible aggregation methods.
    row_filter_buffer_size: int
        Whatever elements are selected for removal based on the
        ``row_filter`` will also have this number of elements removed fore
        and aft. Default is 0.
    asset: Optional[str]
        Asset with which the tags are associated.
    default_asset: Optional[str]
        Asset which will be used if `asset` is not provided and the tag is
        not resolvable to a specific asset.
    n_samples_threshold: int
        The threshold at which the generated DataFrame is considered to have
        too few rows of data. Default is 0.
    _kwargs
        Unused; absorbs any extra keyword arguments.
    """
    self.train_start_date = self._validate_dt(train_start_date)
    self.train_end_date = self._validate_dt(train_end_date)

    if self.train_start_date >= self.train_end_date:
        raise ValueError(
            f"train_end_date ({self.train_end_date}) must be after "
            f"train_start_date ({self.train_start_date})"
        )

    self.tag_list = normalize_sensor_tags(list(tag_list), asset, default_asset)
    self.target_tag_list = (
        normalize_sensor_tags(list(target_tag_list), asset, default_asset)
        if target_tag_list
        else self.tag_list.copy()
    )
    self.resolution = resolution
    self.data_provider = (
        data_provider
        if not isinstance(data_provider, dict)
        else GordoBaseDataProvider.from_dict(data_provider)
    )
    self.row_filter = row_filter
    self.aggregation_methods = aggregation_methods
    self.row_filter_buffer_size = row_filter_buffer_size
    self.asset = asset
    self.n_samples_threshold = n_samples_threshold

    if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
        raise ValueError(
            f"Timestamps ({self.train_start_date}, {self.train_end_date}) "
            f"need to include timezone information"
        )
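Both docstrings note that ``data_provider`` also accepts the dict form produced by a provider's ``.to_dict()`` method. A small sketch of that round trip, assuming string timestamps are parsed by ``_validate_dt`` as the ``Union[datetime, str]`` annotation suggests:

# Sketch of the dict-based provider config mentioned in the docstrings:
# pass the output of .to_dict() and the constructor rebuilds the provider
# via GordoBaseDataProvider.from_dict.
provider_config = DataLakeProvider(interactive=True).to_dict()

dataset = TimeSeriesDataset(
    train_start_date="2017-01-01T08:56:00+00:00",  # tz-aware ISO strings are accepted
    train_end_date="2017-01-01T10:01:00+00:00",
    tag_list=["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"],
    data_provider=provider_config,  # dict form instead of a provider instance
)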
def test_get_data_interactive():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(interactive=True)
    dl_backed = dataset._get_dataset(dataset_config)
    data = dl_backed.get_data()
    assert len(data) >= 0