def test_with_conflicted_file_types_with_preferable_csv(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["csv"],
        partition_by="year",  # string form; presumably equivalent to PartitionBy.YEAR used below
    )
    valid_tag_list = normalize_sensor_tags(["TRC-324"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    trc_324_series = tags_series[0]
    # With lookup_for=["csv"], the CSV file wins the type conflict; it holds a single row
    assert len(trc_324_series) == 1


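# The tests in this module consume ``dates`` and ``assets_config`` fixtures
# defined elsewhere (presumably conftest.py). A minimal sketch of what
# ``dates`` likely provides -- the exact bounds are an assumption, chosen only
# so the range covers the 2001-05/2001-06 test data used below:
#
#     @pytest.fixture
#     def dates():
#         return (
#             dateutil.parser.isoparse("2001-01-01T00:00:00+00:00"),
#             dateutil.parser.isoparse("2001-12-31T00:00:00+00:00"),
#         )

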
def test_parquet_files_lookup(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-323"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    trc_323_series = tags_series[0]
    assert trc_323_series.name == "TRC-323"
    assert trc_323_series.dtype.name == "float64"
    assert len(trc_323_series) == 20


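# ``remove_status_codes`` in the next test arrives as a test argument,
# presumably via parametrization in the full test module. A hedged sketch of
# the decorator the test likely carries (hypothetical -- verify against the
# real file; both branches of the assertion below suggest these two cases):
#
#     @pytest.mark.parametrize("remove_status_codes", [[0], []])

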
def test_load_series_with_filter_bad_data(dates, remove_status_codes, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=remove_status_codes,
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-322"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    # Check that the bad data in the files under
    # tests/gordo/data_provider/data/datalake/TRC-322 is filtered out:
    # 20 rows exist, 5 of them with status code 0.
    n_expected = 15 if remove_status_codes != [] else 20
    assert all(len(series) == n_expected for series in series_gen)


def test_monthly_parquet(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-325"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    index = tags_series[0].index
    assert len(index) == 20
    # The data spans two monthly partitions: 10 rows in May and 10 in June 2001
    dr1 = pd.date_range(start="2001-05-10T00:00:00+00:00", periods=10, freq="1T")
    dr2 = pd.date_range(start="2001-06-10T00:00:00+00:00", periods=10, freq="1T")
    dr = dr1.append(dr2)
    assert index.equals(dr)


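# Note that test_monthly_parquet constructs NcsReader without ``lookup_for`` or
# ``partition_by``, so it exercises the reader's defaults, which evidently
# include a monthly parquet lookup. The exact default values are not visible
# from this file; check the NcsReader signature rather than assuming them.

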
def test_load_series_need_base_path(ncs_reader, dates, assets_config):
    tag = SensorTag("WEIRD-123", "BASE-PATH-ASSET")
    # Without an explicit dl_base_path the reader cannot resolve a directory
    # for this asset, so loading raises
    with pytest.raises(ValueError):
        for _ in ncs_reader.load_series(dates[0], dates[1], [tag]):
            pass

    path_to_weird_base_path_asset = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "data",
        "datalake",
        "base_path_asset",
    )
    ncs_reader_with_base = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        dl_base_path=path_to_weird_base_path_asset,
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )
    for tag_series in ncs_reader_with_base.load_series(dates[0], dates[1], [tag]):
        assert len(tag_series) == 20


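# A minimal usage sketch (hypothetical helper, not exercised by the tests):
# application code typically concatenates the per-tag series yielded by
# ``load_series`` into a single DataFrame. ``reader``, ``start``, ``end`` and
# ``tags`` are assumed to be built as in the tests above.
def _example_series_to_frame(reader, start, end, tags) -> pd.DataFrame:
    # Each yielded item is a named pd.Series; axis=1 aligns them on the
    # timestamp index, producing one column per tag.
    return pd.concat(reader.load_series(start, end, tags), axis=1)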