def dataset():
    """Fixture: a RandomDataset spanning four days with two anonymous-asset tags."""
    tags = [SensorTag("Tag 1", None), SensorTag("Tag 2", None)]
    return RandomDataset(
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tag_list=tags,
    )
def test_aggregation_methods():
    """Tests that it works to set aggregation method(s)."""
    common = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )

    # Default aggregation: plain tag-name columns, no extra column levels.
    X, _ = TimeSeriesDataset(**common).get_data()
    assert X.shape == (83, 3)
    assert list(X.columns) == ["Tag 1", "Tag 2", "Tag 3"]

    # Two aggregation methods: multi-level columns with the tag name on top
    # and the aggregation method as the second level.
    X, _ = TimeSeriesDataset(aggregation_methods=["mean", "max"], **common).get_data()
    assert X.shape == (83, 6)
    expected_columns = [
        (tag, method)
        for tag in ("Tag 1", "Tag 2", "Tag 3")
        for method in ("mean", "max")
    ]
    assert list(X.columns) == expected_columns
def get_random_data():
    """Build a RandomDataset config dict with two tags used as both inputs and targets."""
    tags = [SensorTag("Tag 1", None), SensorTag("Tag 2", None)]
    return {
        "type": "RandomDataset",
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-30 06:00:00Z"),
        "tag_list": list(tags),
        "target_tag_list": list(tags),
    }
def test_from_timeseries_to_gordo():
    """Round-trip: parse time-series API data into IntermediateFormat, then
    convert to gordo format, checking every intermediate structure.

    (Removed commented-out debug logging left over from development.)
    """
    info = IntermediateFormat()
    info.from_time_series_api(tsapi_datas)
    # The intermediate representation must expose tag names, map and data.
    assert info.tag_names == expected_intermediate_tag_names
    assert info.tag_names_map == expected_intermediate_tag_map
    assert info.tag_names_data == expected_intermediate_tag_data
    gordo_data = info.to_gordo(
        tags=[SensorTag(name="tag_1", asset="asset_1")],
        target_tags=[SensorTag(name="tag_2", asset="asset_2")],
    )
    assert gordo_data == expected_gordo_data
def test_load_from_multiple_providers(self):
    """Two tags, each belonging to different data producers, and both get loaded."""
    providers = [self.ab_producer, self.containing_b_producer]
    tags = [SensorTag("abba", None), SensorTag("cba", None)]
    series = list(load_series_from_multiple_providers(providers, None, None, tags))
    self.assertEqual(series[0].name, "ab.*")
    self.assertEqual(series[1].name, ".*b.*")
def test_load_multiple_raises_with_no_matches(self):
    """If no provider matches a tag, load_series_from_multiple_providers
    raises a ValueError once the generator is realized."""
    unmatched_tags = [
        SensorTag("ab", None),
        SensorTag("tag_not_matching_any_of_the_regexps", None),
    ]
    with self.assertRaises(ValueError):
        list(
            load_series_from_multiple_providers(
                [self.ab_producer, self.containing_b_producer],
                None,
                None,
                unmatched_tags,
            )
        )
def test_trigger_tags(mock_tag_normalizer):
    """Tags referenced only by the row_filter are fetched from the provider
    but excluded from the returned X and y frames."""
    provider = MockDataProvider()
    ds = TimeSeriesDataset(
        data_provider=provider,
        tag_list=[SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
        target_tag_list=[SensorTag("Tag 5", None)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        row_filter="`Tag 3` > 0 & `Tag 4` > 1",
        tag_normalizer=mock_tag_normalizer,
    )
    X, y = ds.get_data()
    assert X is not None
    assert y is not None
    # The provider saw every tag: inputs, targets, and the filter's trigger tags.
    expected_fetched = {SensorTag(f"Tag {i}", None) for i in range(1, 6)}
    assert set(provider.last_tag_list) == expected_fetched
    # ...but only genuine input/target tags appear in the returned frames.
    assert set(X.columns.values) == {"Tag 1", "Tag 2"}
    assert set(y.columns.values) == {"Tag 5"}
def test_timeseries_dataset_compat():
    """
    There are accepted keywords in the config file when using
    type: TimeSeriesDataset which don't actually match the kwargs of the
    dataset's __init__; for compatibility
    :func:`gordo.machine.dataset.datasets.compat` should adjust for these
    differences.
    """
    ds = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tags=[SensorTag("Tag 1", None)],
    )
    expected_start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
    expected_end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")
    assert ds.train_start_date == expected_start
    assert ds.train_end_date == expected_end
    assert ds.tag_list == [SensorTag("Tag 1", None)]
def test_time_series_no_resolution():
    """With resolution=None no resampling happens, so more rows come back
    than with an explicit 10-minute resolution."""
    common = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )
    raw, _ = TimeSeriesDataset(resolution=None, **common).get_data()
    resampled, _ = TimeSeriesDataset(resolution="10T", **common).get_data()
    assert len(raw) > len(resampled)
def test_load_series_need_asset_hint(dates, ncs_reader):
    """Without an asset hint load_series raises ValueError; with ASSET_TO_PATH
    patched to resolve the asset, the series load succeeds."""
    with pytest.raises(ValueError):
        for _ in ncs_reader.load_series(
            dates[0], dates[1], [SensorTag("XYZ-123", None)]
        ):
            pass

    here = os.path.dirname(os.path.realpath(__file__))
    path_to_xyz = os.path.join(here, "data", "datalake", "gordoplatform")
    tags_with_asset = [SensorTag("XYZ-123", "gordoplatform")]
    with patch(
        "gordo.machine.dataset.data_provider.ncs_reader.NcsReader.ASSET_TO_PATH",
        {"gordoplatform": path_to_xyz},
    ):
        for frame in ncs_reader.load_series(dates[0], dates[1], tags_with_asset):
            assert len(frame) == 20
def test_can_handle_tag_non_supported_asset_with_base_path(ncs_reader):
    """A tag with an unknown asset is rejected unless the reader was built
    with a dl_base_path."""
    unknown_tag = SensorTag("WEIRD-123", "UNKNOWN-ASSET")
    assert not ncs_reader.can_handle_tag(unknown_tag)

    reader_with_base = NcsReader(
        AzureDLFileSystemMock(), dl_base_path="/this/is/a/base/path"
    )
    assert reader_with_base.can_handle_tag(unknown_tag)
def test_gordo_to_latigo_tag():
    """_gordo_to_latigo_tag converts a gordo SensorTag to a LatigoSensorTag,
    preserving both the name and the asset."""
    name = "some_name"
    asset = "some_asset"
    gordo_tag = SensorTag(name=name, asset=asset)
    assert isinstance(gordo_tag, SensorTag)

    latigo_tag = _gordo_to_latigo_tag(gordo_tag)
    assert isinstance(latigo_tag, LatigoSensorTag)
    # isinstance instead of `type(x) == type(y)`: the idiomatic (and
    # lint-clean) way to assert the converted fields are still strings.
    assert isinstance(latigo_tag.name, str)
    assert isinstance(latigo_tag.asset, str)
    assert latigo_tag.name == name
    assert latigo_tag.asset == asset
def test_load_multiple_matches_loads_from_first(self):
    """When a tag can be read from multiple providers, the first provider
    in the list gets the job."""
    series = list(
        load_series_from_multiple_providers(
            [self.ab_producer, self.containing_b_producer],
            None,
            None,
            [SensorTag("abba", None)],
        )
    )
    self.assertEqual(series[0].name, "ab.*")
def test_load_series_no_data(self, _mocked_method):
    """load_series raises ValueError when it finds none of the requested tags."""
    reader = IrocReader(client=None, threads=1)
    unknown_tags = [SensorTag("jalla", None)]  # Not a tag in the input
    with self.assertRaises(ValueError):
        list(
            reader.load_series(
                train_start_date=isoparse("2018-05-02T01:56:00+00:00"),
                train_end_date=isoparse("2018-05-03T01:56:00+00:00"),
                tag_list=unknown_tags,
            )
        )
def test_load_series_checks_date(self):
    """load_series raises ValueError if train_end_date < train_start_date."""
    reader = IrocReader(client=None, threads=1)
    start = isoparse("2018-05-03T01:56:00+00:00")
    end = isoparse("2018-05-02T01:56:00+00:00")  # deliberately before start
    with self.assertRaises(ValueError):
        list(
            reader.load_series(
                train_start_date=start,
                train_end_date=end,
                tag_list=[SensorTag("jalla", None)],  # Not a tag in the input
            )
        )
def _machine(name: str) -> Machine:
    """
    Helper to build a basic Machine, only defining its name
    """
    from gordo.machine.dataset.sensor_tag import SensorTag

    dataset_config = {
        "tag_list": [SensorTag("tag-1", "foo"), SensorTag("tag-2", "foo")],
        "train_start_date": "2016-01-01T00:00:00Z",
        "train_end_date": "2016-01-05T00:00:00Z",
    }
    return Machine.from_config(
        config={
            "name": name,
            "dataset": dataset_config,
            "model": "sklearn.linear_model.LinearRegression",
        },
        project_name="test-project",
    )
def test_insufficient_data_after_row_filtering(n_samples_threshold, filter_value):
    """
    A row_filter that leaves fewer rows than n_samples_threshold must raise
    InsufficientDataAfterRowFilteringError.
    """
    common = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        n_samples_threshold=n_samples_threshold,
    )
    row_filter = f"`Tag 1` < {filter_value}"
    with pytest.raises(InsufficientDataAfterRowFilteringError):
        TimeSeriesDataset(row_filter=row_filter, **common).get_data()
def test_row_filter():
    """Tests that row_filter filters away rows."""
    common = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )

    # No filter: all resampled rows are kept.
    X, _ = TimeSeriesDataset(**common).get_data()
    assert len(X) == 577

    # Simple comparison filter drops most rows.
    X, _ = TimeSeriesDataset(row_filter="`Tag 1` < 5000", **common).get_data()
    assert len(X) == 8

    # Arithmetic between tags also works inside the filter expression.
    X, _ = TimeSeriesDataset(
        row_filter="`Tag 1` / `Tag 3` < 0.999", **common
    ).get_data()
    assert len(X) == 3
def test_load_series_missing_columns_data(self, _mocked_method):
    """load_series raises ValueError if there is even a single tag it cannot find."""
    reader = IrocReader(client=None, threads=1)
    tags = IROC_HAPPY_TAG_LIST + [SensorTag("jalla", None)]  # "jalla" is not a tag
    with self.assertRaises(ValueError):
        list(
            reader.load_series(
                train_start_date=isoparse("2018-05-02T01:56:00+00:00"),
                train_end_date=isoparse("2018-05-03T01:56:00+00:00"),
                tag_list=tags,
            )
        )
def test_insufficient_data_after_automatic_filtering():
    """
    Automatic filtering (filter_periods) that leaves fewer rows than
    n_samples_threshold must raise InsufficientDataError.
    """
    params = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        n_samples_threshold=84,
        filter_periods={"filter_method": "median"},
    )
    with pytest.raises(InsufficientDataError):
        TimeSeriesDataset(**params).get_data()
def test_metadata_statistics():
    """Tests that dataset metadata exposes x_hist statistics, one entry per tag.

    (The previous docstring was a stale copy-paste from
    test_aggregation_methods.)
    """
    params = dict(
        data_provider=MockDataProvider(),
        tag_list=[
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )
    dataset = TimeSeriesDataset(**params)
    X, _ = dataset.get_data()
    assert (83, 3) == X.shape

    # One histogram entry is produced per input tag.
    metadata = dataset.get_metadata()
    assert isinstance(metadata["x_hist"], dict)
    assert len(metadata["x_hist"].keys()) == 3
def test_insufficient_data_after_known_filter_periods_filtering():
    """
    known_filter_periods that removes nearly every row must raise
    InsufficientDataError once fewer than n_samples_threshold rows remain.
    """
    params = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(f"Tag {i}", None) for i in (1, 2, 3)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        n_samples_threshold=10,
        known_filter_periods=[
            "~('2017-12-25 07:00:00+00:00' <= index <= '2017-12-29 06:00:00+00:00')"
        ],
    )
    with pytest.raises(InsufficientDataError):
        TimeSeriesDataset(**params).get_data()
def test_load_series_need_base_path(ncs_reader, dates):
    """A base-path-only asset fails to load without dl_base_path and
    succeeds once the reader is constructed with one."""
    tag = SensorTag("WEIRD-123", "BASE-PATH-ASSET")
    with pytest.raises(ValueError):
        for _ in ncs_reader.load_series(dates[0], dates[1], [tag]):
            pass

    base_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "data",
        "datalake",
        "base_path_asset",
    )
    reader_with_base = NcsReader(AzureDLFileSystemMock(), dl_base_path=base_path)
    for series in reader_with_base.load_series(dates[0], dates[1], [tag]):
        assert len(series) == 20
def test_join_timeseries_empty_series(value, n_rows, resolution, error):
    """
    Test that empty data scenarios raise appropriate errors
    """
    # NOTE(review): the parametrized `value` argument is never used — the
    # provider is always fed NaN. Confirm against the @parametrize decorator
    # whether `value` was meant to be passed to MockDataProvider.
    train_start_date = dateutil.parser.isoparse("2018-01-01 00:00:00+00:00")
    train_end_date = dateutil.parser.isoparse("2018-01-05 00:00:00+00:00")
    tag_list = [SensorTag(name=n, asset=None) for n in ["Tag 1", "Tag 2", "Tag 3"]]
    kwargs = {
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": tag_list,
        "resolution": resolution,
        # np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0.
        "data_provider": MockDataProvider(value=np.nan, n_rows=n_rows),
    }
    with pytest.raises(error):
        TimeSeriesDataset(**kwargs).get_data()
def dataset():
    """Fixture: a one-tag RandomDataset covering the whole of 2017."""
    tags = [SensorTag("Tag 1", None)]
    return RandomDataset(
        train_start_date="2017-01-01 00:00:00Z",
        train_end_date="2018-01-01 00:00:00Z",
        tag_list=tags,
    )
def ncs_reader(): return NcsReader(AzureDLFileSystemMock()) @pytest.fixture def dates(): return ( dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"), dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"), ) @pytest.mark.parametrize( "tag_to_check", [normalize_sensor_tags(["TRC-123"])[0], SensorTag("XYZ-123", "1776-TROC")], ) def test_can_handle_tag_ok(tag_to_check, ncs_reader): assert ncs_reader.can_handle_tag(tag_to_check) @pytest.mark.parametrize( "tag_to_check", [SensorTag("TRC-123", None), SensorTag("XYZ-123", "123-XXX")]) def test_can_handle_tag_notok(tag_to_check, ncs_reader): assert not ncs_reader.can_handle_tag(tag_to_check) def test_can_handle_tag_unknow_prefix_raise(ncs_reader): with pytest.raises(ValueError):
def test_can_handle_tag_no_asset(self):
    """A tag without an asset is rejected, even with a known-looking prefix."""
    reader = IrocReader(client=None, threads=1)
    tag = SensorTag("UON_EF.xxx", None)
    assert not reader.can_handle_tag(tag)
def test_can_handle_tag_unknown_asset(self):
    """A tag with an unrecognized asset cannot be handled."""
    iroc_reader = IrocReader(client=None, threads=1)
    # Fixed mojibake: the asset string previously contained a stray "Ǹ"
    # (U+01F9); any unknown asset name works, so normalize to plain ASCII.
    assert not iroc_reader.can_handle_tag(
        SensorTag("UON_EF.xxx", "UNKNOWN_ASSET"))
def test_can_handle_tag_ok(self):
    """A tag whose asset is known to the reader is accepted."""
    reader = IrocReader(client=None, threads=1)
    tag = SensorTag("UON_EF.xxx", "UON_EF")
    assert reader.can_handle_tag(tag)
import unittest from io import StringIO from unittest import mock from dateutil.parser import isoparse # type: ignore from gordo.machine.dataset.data_provider.iroc_reader import IrocReader, read_iroc_file from gordo.machine.dataset.sensor_tag import normalize_sensor_tags from gordo.machine.dataset.sensor_tag import SensorTag IROC_HAPPY_TAG_LIST = [ SensorTag("NINENINE.OPCIS::NNFCDPC01.AI1410J0", "NINENINE"), SensorTag("NINENINE.OPCIS::NNFCDPC01.AI1840C1J0", "NINENINE"), SensorTag("NINENINE.OPCIS::NNFCDPC01.AI1840E1J0", "NINENINE"), ] HAPPY_FROM_TS = isoparse("2018-05-02T01:56:00+00:00") HAPPY_TO_TS = isoparse("2018-05-03T01:56:00+00:00") IROC_MANY_ASSETS_TAG_LIST = [ "NINENINE.OPCIS::NNFCDPC01.AI1410J0", "NINENINE.OPCIS::NNFCDPC01.AI1840C1J0", "NINENINE.OPCIS::NNFCDPC01.AI1840E1J0", "UON_EF.OPCIS::LO006-B1H.PRCASXIN", "UON_EF.OPCIS::LO006-B1H.PRTUBXIN", "UON_EF.OPCIS::LO006-B1H_M1.PRSTAXIN", "UON_EF.OPCIS::LO006-B1H_M1.RTGASDIN", ] IROC_NO_ASSET_TAG_LIST = [ SensorTag("NOT.OPCIS::NNFCDPC01.AI1410J0", "NOT"),