def dataset():
    """Build a four-day RandomDataset over two asset-less tags."""
    return RandomDataset(
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tag_list=[SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    )
def test_aggregation_methods():
    """Tests that it works to set aggregation method(s)"""
    common_args = {
        "data_provider": MockDataProvider(),
        "tag_list": [
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ],
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    }

    # Default aggregation gives no extra columns; the single (default)
    # aggregation method yields plain tag names as column labels.
    frame, _ = TimeSeriesDataset(**common_args).get_data()
    assert frame.shape == (83, 3)
    assert list(frame.columns) == ["Tag 1", "Tag 2", "Tag 3"]

    # Two aggregation methods produce a two-level column index:
    # tag name on top, aggregation method as the second level.
    frame, _ = TimeSeriesDataset(
        aggregation_methods=["mean", "max"], **common_args
    ).get_data()
    assert frame.shape == (83, 6)
    assert list(frame.columns) == [
        (tag, method)
        for tag in ("Tag 1", "Tag 2", "Tag 3")
        for method in ("mean", "max")
    ]
def test_lookup_default(legacy_ncs_lookup: NcsLookup, mock_assets_config, threads_count):
    """Lookup over two year partitions resolves each tag to its file location."""
    tags = [
        SensorTag("Ásgarðr", "asset"),
        SensorTag("tag1", "asset"),
        SensorTag("tag2", "asset"),
        SensorTag("tag4", "asset"),
        SensorTag("tag5", "asset1"),
    ]
    partitions = [YearPartition(2019), YearPartition(2020)]
    lookup_result = list(
        legacy_ncs_lookup.lookup(
            mock_assets_config, tags, partitions, threads_count=threads_count
        )
    )
    expected = {
        ("Ásgarðr", YearPartition(2019)): (
            "path/%C3%81sgar%C3%B0r/%C3%81sgar%C3%B0r_2019.csv",
            CsvFileType,
        ),
        ("tag2", YearPartition(2020)): (
            "path/tag2/parquet/tag2_2020.parquet",
            ParquetFileType,
        ),
        ("tag5", YearPartition(2020)): (
            "path1/tag5/parquet/tag5_2020.parquet",
            ParquetFileType,
        ),
    }
    assert reduce_tag_locations(lookup_result) == expected
def test_tag_locations(parquet_file_type):
    """TagLocations exposes sorted partitions, lookup by partition, and iteration."""
    tag = SensorTag("tag1", "asset")
    newer = Location("path/2020.parquet", parquet_file_type)
    older = Location("path/2018.parquet", parquet_file_type)
    tag_locations = TagLocations(
        tag, {YearPartition(2020): newer, YearPartition(2018): older}
    )

    assert tag_locations.available()
    # Partitions come back in ascending order regardless of insertion order
    assert tag_locations.partitions() == [YearPartition(2018), YearPartition(2020)]
    assert tag_locations.get_location(2020) is newer
    assert tag_locations.get_location(2019) is None

    # Iteration yields (tag, partition, location) triples, oldest first
    assert list(tag_locations) == [
        (tag, YearPartition(2018), Location("path/2018.parquet", parquet_file_type)),
        (tag, YearPartition(2020), Location("path/2020.parquet", parquet_file_type)),
    ]
def test_target_tags_from_tag_loading_metadata():
    """
    A plain-string entry in ``tag_list`` ("test1") should be resolved into a
    full SensorTag via the build metadata's ``tag_loading_metadata`` tags.

    NOTE(review): renamed from ``test_empty_target_tag_list`` — this module
    defines another test with that exact name, so one of the two was silently
    shadowed and never collected by pytest. The old name was also misleading:
    this test exercises tag resolution from metadata, not an empty target list.
    """
    app = Flask(__name__)
    with app.app_context():
        g.metadata = {
            "dataset": {"tag_list": [SensorTag("test", "asset"), "test1"]},
            "metadata": {
                "build_metadata": {
                    "dataset": {
                        "dataset_meta": {
                            "tag_loading_metadata": {
                                "tags": {
                                    "test": {"name": "test", "asset": "asset"},
                                    "test1": {"name": "test1", "asset": "asset1"},
                                }
                            }
                        }
                    }
                }
            },
        }
        view = BaseModelView()
        assert view.target_tags == [
            SensorTag("test", "asset"),
            SensorTag("test1", "asset1"),
        ]
def get_random_data():
    """Return a RandomDataset config dict spanning five days with two tags."""
    return {
        "type": "RandomDataset",
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-30 06:00:00Z"),
        "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
        "target_tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    }
def test_legacy_to_dict():
    """to_dict() on a positionally-constructed RandomDataset records its type."""
    legacy_dataset = RandomDataset(
        "2017-12-25 06:00:00Z",
        "2017-12-29 06:00:00Z",
        [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    )
    assert legacy_dataset.to_dict()["type"] == "RandomDataset"
def test_load_series_need_asset_hint(dates, ncs_reader):
    """Loading a tag without an asset raises; with a valid asset it loads."""
    # Without an asset hint, realizing the generator raises ValueError
    with pytest.raises(ValueError):
        for _ in ncs_reader.load_series(
            dates[0], dates[1], [SensorTag("XYZ-123", None)]
        ):
            pass

    # The same tag with a valid asset loads series of the expected length
    tagged_with_asset = [SensorTag("XYZ-123", "gordoplatform")]
    for series in ncs_reader.load_series(dates[0], dates[1], tagged_with_asset):
        assert len(series) == 20
def test_get_dataset_with_full_import():
    """A fully-qualified "type" path resolves to the RandomDataset class."""
    config = {
        "type": "gordo_dataset.datasets.RandomDataset",
        "train_start_date": "2017-12-25 06:00:00Z",
        "train_end_date": "2017-12-29 06:00:00Z",
        "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    }
    assert type(_get_dataset(config)) is RandomDataset
def test_load_from_multiple_providers(self):
    """Two tags, each belonging to different data producers, and both gets loaded"""
    loaded = list(
        load_series_from_multiple_providers(
            [self.ab_producer, self.containing_b_producer],
            None,
            None,
            [SensorTag("abba", None), SensorTag("cba", None)],
        )
    )
    self.assertEqual(loaded[0].name, "ab.*")
    self.assertEqual(loaded[1].name, ".*b.*")
def test_load_multiple_raises_with_no_matches(self):
    """If no provider matches a tag then load_series_from_multiple_providers
    raises a ValueError when the generator is realized"""
    unmatched_tags = [
        SensorTag("ab", None),
        SensorTag("tag_not_matching_any_of_the_regexps", None),
    ]
    with self.assertRaises(ValueError):
        list(
            load_series_from_multiple_providers(
                [self.ab_producer, self.containing_b_producer],
                None,
                None,
                unmatched_tags,
            )
        )
def test_trigger_tags():
    """Tags referenced only by row_filter are fetched but excluded from X/y."""
    provider = MockDataProvider()
    dataset = TimeSeriesDataset(
        data_provider=provider,
        tag_list=[SensorTag("Tag 1", "asset"), SensorTag("Tag 2", "asset")],
        target_tag_list=[SensorTag("Tag 5", "asset")],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        row_filter="`Tag 3` > 0 & `Tag 4` > 1",
        asset="asset",
    )
    X, y = dataset.get_data()
    assert X is not None
    assert y is not None

    # The provider was asked for the row_filter ("trigger") tags 3 and 4 too,
    # even though they never appear in the returned frames
    expected_fetch = {SensorTag(f"Tag {i}", "asset") for i in (1, 2, 3, 4, 5)}
    assert set(provider.last_tag_list) == expected_fetch
    assert set(X.columns.values) == {"Tag 1", "Tag 2"}
    assert set(y.columns.values) == {"Tag 5"}
def test_lookup_exceptions(legacy_ncs_lookup: NcsLookup, mock_assets_config, threads_count):
    """Realizing the lookup for this tag set raises ConfigException."""
    bad_tags = [SensorTag("Ásgarðr", "asset"), SensorTag("tag1", "asset")]
    partitions = [YearPartition(2019), YearPartition(2020)]
    with pytest.raises(ConfigException):
        list(
            legacy_ncs_lookup.lookup(
                mock_assets_config,
                bad_tags,
                partitions,
                threads_count=threads_count,
            )
        )
def test_time_series_no_resolution():
    """resolution=None keeps raw samples, so it yields more rows than '10T'."""
    base_kwargs = {
        "data_provider": MockDataProvider(),
        "tag_list": [
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ],
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    }
    raw, _ = TimeSeriesDataset(resolution=None, **base_kwargs).get_data()
    resampled, _ = TimeSeriesDataset(resolution="10T", **base_kwargs).get_data()
    assert len(raw) > len(resampled)
def test_from_dict_with_empty_type():
    """Omitting "type" in the config falls back to TimeSeriesDataset."""
    start = datetime(2020, 1, 1, tzinfo=tzutc())
    end = datetime(2020, 3, 1, tzinfo=tzutc())
    tags = [SensorTag("tag1", "asset"), SensorTag("tag2", "asset")]

    dataset = GordoBaseDataset.from_dict(
        {
            "train_start_date": start,
            "train_end_date": end,
            "tag_list": tags,
        }
    )

    assert type(dataset) is TimeSeriesDataset
    assert dataset.train_start_date == start
    assert dataset.train_end_date == end
    assert dataset.tag_list == tags
def test_to_dict_build_in():
    """to_dict() serializes datetimes as ISO-8601 strings and records the type."""
    start = datetime(2020, 1, 1, tzinfo=tzutc())
    end = datetime(2020, 3, 1, tzinfo=tzutc())
    tags = [SensorTag("tag1", "asset"), SensorTag("tag2", "asset")]

    config = TimeSeriesDataset(
        train_start_date=start, train_end_date=end, tag_list=tags
    ).to_dict()

    assert config["train_start_date"] == "2020-01-01T00:00:00+00:00"
    assert config["train_end_date"] == "2020-03-01T00:00:00+00:00"
    assert config["tag_list"] == tags
    assert config["type"] == "TimeSeriesDataset"
def test_tag_dirs_lookup(legacy_ncs_lookup: NcsLookup):
    """tag_dirs_lookup yields a directory per resolvable tag, None otherwise."""
    tags = [
        SensorTag("Ásgarðr", "asset"),
        SensorTag("tag1", "asset"),
        SensorTag("tag2", "asset"),
        SensorTag("tag4", "asset"),
    ]
    found = {
        tag.name: tag_dir
        for tag, tag_dir in legacy_ncs_lookup.tag_dirs_lookup("path", tags)
    }
    assert found == {
        "Ásgarðr": "path/%C3%81sgar%C3%B0r",
        "tag2": "path/tag2",
        "tag1": None,
        "tag4": None,
    }
def test_empty_target_tag_list():
    """With no target tags in the metadata, target_tags mirrors tag_list."""
    app = Flask(__name__)
    only_tag = SensorTag("test", "asset")
    with app.app_context():
        g.metadata = {"dataset": {"tag_list": [only_tag]}}
        assert BaseModelView().target_tags == [only_tag]
def test_monthly_partition_lookup(default_ncs_lookup: NcsLookup, mock_assets_config):
    """Monthly-partitioned lookup returns parquet locations for found months."""
    partitions = [MonthPartition(2020, month) for month in (2, 3, 4)]
    tag_locations_list = list(
        default_ncs_lookup.lookup(
            mock_assets_config, [SensorTag("tag11", "asset")], partitions
        )
    )
    assert len(tag_locations_list) == 1
    locations = tag_locations_list[0]
    # Partition (2020, 3) is absent from the result; only Feb and Apr remain
    assert locations.partitions() == [MonthPartition(2020, 2), MonthPartition(2020, 4)]

    for month, expected_path in (
        (2, "path/tag11/parquet/2020/tag11_202002.parquet"),
        (4, "path/tag11/parquet/2020/tag11_202004.parquet"),
    ):
        location = locations.get_location(MonthPartition(2020, month))
        assert location is not None
        assert location.path == expected_path
        assert isinstance(location.file_type, ParquetFileType)
        assert location.partition == MonthPartition(2020, month)
def test_timeseries_dataset_compat():
    """
    There are accepted keywords in the config file when using
    type: TimeSeriesDataset which don't actually match the kwargs of the
    dataset's __init__; for compatibility :func:`gordo_dataset.datasets.compat`
    should adjust for these differences.
    """
    dataset = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tags=[SensorTag("Tag 1", None)],
    )
    # String dates are parsed, and the legacy "tags" keyword maps to tag_list
    assert dataset.train_start_date == dateutil.parser.isoparse("2017-12-25 06:00:00Z")
    assert dataset.train_end_date == dateutil.parser.isoparse("2017-12-29 06:00:00Z")
    assert dataset.tag_list == [SensorTag("Tag 1", None)]
def test_can_handle_tag_no_asset():
    """A tag without an asset is rejected by IrocReader.can_handle_tag."""
    reader = IrocReader(
        storage=None,
        assets_config=None,
        threads=1,
        storage_name="dataplatformdlsprod",
    )
    assert not reader.can_handle_tag(SensorTag("UON_EF.xxx", None))
def test_can_handle_tag_ok(mock_file_system):
    """A tag with a known asset is accepted by IrocReader.can_handle_tag."""
    reader = IrocReader(
        storage=mock_file_system,
        assets_config=load_assets_config(),
        threads=1,
        storage_name="dataplatformdlsprod",
    )
    assert reader.can_handle_tag(SensorTag("UON_EF.xxx", "UON_EF"))
def test_assets_config_wrong_reader(legacy_ncs_lookup: NcsLookup, mock_assets_config):
    """Looking up a tag on asset "asset5" raises ValueError when realized."""
    with pytest.raises(ValueError):
        list(
            legacy_ncs_lookup.assets_config_tags_lookup(
                mock_assets_config, [SensorTag("tag4", "asset5")]
            )
        )
def _machine(name: str) -> Machine:
    """
    Helper to build a basic Machine, only defining its name
    """
    from gordo_dataset.sensor_tag import SensorTag

    config = {
        "name": name,
        "dataset": {
            "tag_list": [SensorTag("tag-1", "foo"), SensorTag("tag-2", "foo")],
            "train_start_date": "2016-01-01T00:00:00Z",
            "train_end_date": "2016-01-05T00:00:00Z",
        },
        "model": {"sklearn.linear_model.LinearRegression": {}},
    }
    return Machine.from_config(config=config, project_name="test-project")
def test_assets_config_tags_lookup_exceptions(legacy_ncs_lookup: NcsLookup, mock_assets_config):
    """Both an empty asset string and asset "asset10" trigger ValueError."""
    # tag10 with an empty asset string
    with pytest.raises(ValueError):
        list(
            legacy_ncs_lookup.assets_config_tags_lookup(
                mock_assets_config,
                [SensorTag("Ásgarðr", "asset"), SensorTag("tag10", "")],
            )
        )
    # tag10 with asset "asset10"
    with pytest.raises(ValueError):
        list(
            legacy_ncs_lookup.assets_config_tags_lookup(
                mock_assets_config,
                [SensorTag("Ásgarðr", "asset"), SensorTag("tag10", "asset10")],
            )
        )
def test_assets_config_tags_lookup(legacy_ncs_lookup: NcsLookup, mock_assets_config):
    """Each tag maps to its resolved directory, or None when unresolved."""
    tags = [
        SensorTag("Ásgarðr", "asset"),
        SensorTag("tag1", "asset"),
        SensorTag("tag2", "asset"),
        SensorTag("tag4", "asset"),
        SensorTag("tag5", "asset1"),
    ]
    lookup_result = list(
        legacy_ncs_lookup.assets_config_tags_lookup(mock_assets_config, tags)
    )
    assert lookup_result == [
        (SensorTag("Ásgarðr", "asset"), "path/%C3%81sgar%C3%B0r"),
        (SensorTag("tag2", "asset"), "path/tag2"),
        (SensorTag("tag1", "asset"), None),
        (SensorTag("tag4", "asset"), None),
        (SensorTag("tag5", "asset1"), "path1/tag5"),
    ]
def test_process_metadata():
    """process_metadata=False leaves the dataset's _metadata dict empty."""
    dataset = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
        target_tag_list=[SensorTag("Tag 5", None)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        row_filter="`Tag 3` > 0 & `Tag 4` > 1",
        process_metadata=False,
        asset="asset",
    )
    dataset.get_data()
    assert dataset._metadata == {}
def test_load_multiple_matches_loads_from_first(self):
    """When a tag can be read from multiple providers it is the first provider
    in the list of providers which gets the job"""
    loaded = list(
        load_series_from_multiple_providers(
            [self.ab_producer, self.containing_b_producer],
            None,
            None,
            [SensorTag("abba", None)],
        )
    )
    self.assertEqual(loaded[0].name, "ab.*")
def test_can_handle_tag_non_supported_asset_with_base_path(
        ncs_reader, assets_config):
    """An unknown asset is rejected, unless the reader has a dl_base_path."""
    unknown_tag = SensorTag("WEIRD-123", "UNKNOWN-ASSET")
    assert not ncs_reader.can_handle_tag(unknown_tag)

    reader_with_base = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        dl_base_path="/this/is/a/base/path",
    )
    # With dl_base_path set, the same tag is accepted
    assert reader_with_base.can_handle_tag(unknown_tag)
def test_insufficient_data_after_automatic_filtering():
    """
    Test that dataframe after row_filter scenarios raise appropriate
    InsufficientDataError
    """
    params = {
        "data_provider": MockDataProvider(),
        "tag_list": [
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ],
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        # Threshold set just above the 83 rows the mock data yields
        "n_samples_threshold": 84,
        "filter_periods": {"filter_method": "median"},
    }
    with pytest.raises(InsufficientDataError):
        TimeSeriesDataset(**params).get_data()