def test_load_metadata_datatime(self):
    """Check the datetime of the first metadata loaded for ``A_STATION``."""
    metadata = MetadataLoader(CATALOG_PATH).load(A_STATION, A_STATION_COORDINATE)

    expected_datetime = datetime(2010, 1, 1, 8, 0, 0, 0)
    first_metadata = next(metadata)

    self.assertEqual(expected_datetime, first_metadata.datetime)
def test_load_metadata_image_offset_with_no_compression(self):
    """Check that the first image offset is 0 when ``compression=None``."""
    metadata = MetadataLoader(CATALOG_PATH).load(
        A_STATION,
        A_STATION_COORDINATE,
        compression=None,
        night_time=False,
    )

    first_metadata = next(metadata)
    first_offset = first_metadata.image_offsets[0]

    self.assertAlmostEqual(first_offset, 0)
def test_givenTargetDatetimes_whenLoad_shouldLoadMetadataInOrder(self):
    """Check that loaded metadata follow the order of ``SOME_TARGET_DATETIMES``."""
    metadata = MetadataLoader(CATALOG_PATH).load(
        Station.BND,
        A_STATION_COORDINATE,
        target_datetimes=SOME_TARGET_DATETIMES,
    )

    # zip() stops at the shorter of the two sequences; only the pairs that
    # exist on both sides are compared here.
    pairs = zip(metadata, SOME_TARGET_DATETIMES)
    for loaded, expected_datetime in pairs:
        self.assertEqual(loaded.datetime, expected_datetime)
def test_load_metadata_coodinates(self):
    """Check that loaded metadata carry the coordinates passed to ``load``."""
    loader = MetadataLoader(CATALOG_PATH)
    bnd = Station.BND
    expected_coordinates = Coordinates(*STATION_COORDINATES[bnd])

    metadata = loader.load(bnd, expected_coordinates, night_time=False)
    first_metadata = next(metadata)

    self.assertEqual(expected_coordinates, first_metadata.coordinates)
def test_load_metadata_target_cloudiness_6hour(self):
    """Check the 6-hour cloudiness target of the first BND metadata."""
    expected_cloudiness_6h = "variable"

    metadata = MetadataLoader(CATALOG_PATH).load(Station.BND, A_STATION_COORDINATE)
    first_metadata = next(metadata)

    self.assertEqual(expected_cloudiness_6h, first_metadata.target_cloudiness_6h)
def test_load_metadata_target_cloudiness(self):
    """Check the cloudiness target of the first BND metadata."""
    loader = MetadataLoader(CATALOG_PATH)
    station_with_target = Station.BND
    target = "night"

    metadata = loader.load(station_with_target, A_STATION_COORDINATE)
    actual_target: Any = next(metadata).target_cloudiness

    # The target is a string, so compare with assertEqual: assertAlmostEqual
    # subtracts its operands when they differ, which raises TypeError for
    # strings instead of reporting a readable assertion failure.
    self.assertEqual(target, actual_target)
def test_load_metadata_target_ghi_6hour(self):
    """Check the 6-hour GHI target of the first BND metadata."""
    expected_ghi_6h = 29.10666666666667

    metadata = MetadataLoader(CATALOG_PATH).load(Station.BND, A_STATION_COORDINATE)
    first_metadata = next(metadata)
    actual_ghi_6h: Any = first_metadata.target_ghi_6h

    self.assertAlmostEqual(expected_ghi_6h, actual_ghi_6h)
def test_load_metadata_target_ghi_(self):
    """Check the GHI target of the first BND metadata."""
    expected_ghi = -3.986666666666666

    metadata = MetadataLoader(CATALOG_PATH).load(Station.BND, A_STATION_COORDINATE)
    first_metadata = next(metadata)
    actual_ghi: Any = first_metadata.target_ghi

    self.assertAlmostEqual(expected_ghi, actual_ghi)
def test_load_metadata_image_path_with_16bit_compression(self):
    """Check that the first image path mentions "16bit" for 16-bit compression."""
    loader = MetadataLoader(CATALOG_PATH)

    metadata = loader.load(A_STATION, A_STATION_COORDINATE, compression="16bit")
    first_image_path = next(metadata).image_paths[0]

    # assertIn reports both the needle and the actual path on failure,
    # whereas assertTrue(... in ...) only reports "False is not true".
    self.assertIn("16bit", first_image_path)
def test_load_metadata_with_specified_dataframe(self):
    """Check that a loader built from an in-memory dataframe yields GHI targets."""
    # Open the sample catalog in a context manager so the file handle is
    # closed deterministically instead of leaking until garbage collection.
    # NOTE(review): pickle is only safe on trusted input; this path looks
    # like a test fixture shipped with the project - confirm.
    with open("tests/data/samples/catalog-test.pkl", "rb") as catalog_file:
        dummy_catalog = pickle.load(catalog_file)

    loader = MetadataLoader(file_name=None, dataframe=dummy_catalog)
    station_with_target = Station.BND
    target_6h = 29.10666666666667

    metadata = loader.load(station_with_target, A_STATION_COORDINATE)
    actual_target_6h: Any = next(metadata).target_ghi_6h

    self.assertAlmostEqual(target_6h, actual_target_6h)
def prepare_dataloader(
    dataframe: pd.DataFrame,
    target_datetimes: typing.List[datetime.datetime],
    station: str,
    coordinates: typing.Tuple[float, float, float],
    target_time_offsets: typing.List[datetime.timedelta],
    config: dataloader.DataloaderConfig,
) -> tf.data.Dataset:
    """Output data.

    Note that you can use either the netCDF or HDF5 data. Each iteration over
    your data loader should return a 2-element tuple containing the tensor
    that should be provided to the model as input, and the target values. In
    this specific case, you will not be able to provide the latter since the
    dataframe contains no GHI, and we are only interested in predictions, not
    training. Therefore, you must return a placeholder (or ``None``) as the
    second tuple element.

    Reminder: the dataframe contains imagery paths for every possible
    timestamp requested in ``target_datetimes``. However, we expect that you
    will use some of the "past" imagery (i.e. imagery at T<=0) for any T in
    ``target_datetimes``, but you should NEVER rely on "future" imagery to
    generate predictions (for T>0). We will be inspecting data loader
    implementations to ensure this is the case, and those who "cheat" will be
    dramatically penalized.

    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md
    for more information.

    Args:
        dataframe: a pandas dataframe that provides the netCDF file path (or
            HDF5 file path and offset) for all relevant timestamp values over
            the test period.
        target_datetimes: a list of timestamps that your data loader should
            use to provide imagery for your model. The ordering of this list
            is important, as each element corresponds to a sequence of GHI
            values to predict. By definition, the GHI values must be provided
            for the offsets given by ``target_time_offsets`` which are added
            to each timestamp (T=0) in this datetimes list.
        station: station name of interest.
        coordinates: station's coordinates (latitude, longitude, elevation).
            During evaluation time, it will only be one station to avoid
            confusions. See comment on function ``generate_all_predictions``
            with the for loop.
        target_time_offsets: the list of timedeltas to predict GHIs for (by
            definition: [T=0, T+1h, T+3h, T+6h]). Not referenced by this
            function's body.
        config: configuration for the dataloader.

    Returns:
        A ``tf.data.Dataset`` object that can be used to produce input tensors
        for your model. One tensor must correspond to one sequence of past
        imagery data. The tensors must be generated in the order given by
        ``target_datetimes``.
    """
    logger.info(f"Prepare dataloader for station {station} and config {config}")

    metadata_loader = MetadataLoader(dataframe=dataframe, training=False)

    # Convert the arguments eagerly so an invalid station or coordinate tuple
    # still fails here rather than on first iteration of the dataset.
    station_of_interest = Station(station)
    station_coordinates = Coordinates(
        coordinates[0], coordinates[1], coordinates[2]
    )

    def metadata_generator():
        # Build a fresh iterator on every call: handing out one shared,
        # already-created generator would leave any second pass over the
        # dataset empty, because a generator can only be consumed once.
        return metadata_loader.load(
            station_of_interest,
            station_coordinates,
            target_datetimes=target_datetimes,
            skip_missing=False,
            num_images=config.num_images,
            time_interval_min=config.time_interval_min,
        )

    return dataloader.create_dataset(
        metadata_generator,
        config=config,
        enable_image_cache=False,
    )
def test_load_metadata_with_night_time(self):
    """Check the night-time count reported by ``self._night_time``."""
    metadata = MetadataLoader(CATALOG_PATH).load(
        A_STATION,
        A_STATION_COORDINATE,
        night_time=True,
    )

    night_time_count = self._night_time(metadata)
    expected_count = NUM_METADATA - NUM_METADATA_BND_DAY_TIME

    self.assertEqual(expected_count, night_time_count)
def test_givenTargetDatetimes_whenLoad_shouldLoadSameAmountOfMetadata(self):
    """Check that one metadata is counted per requested target datetime."""
    metadata = MetadataLoader(CATALOG_PATH).load(
        Station.BND,
        A_STATION_COORDINATE,
        target_datetimes=SOME_TARGET_DATETIMES,
    )

    loaded_count = self._num_metadata(metadata)

    self.assertEqual(loaded_count, len(SOME_TARGET_DATETIMES))
def __init__(self):
    """Create ``self.dataset`` from the catalog with only the target GHI feature.

    The configuration starts from ``default_config()`` and uses the
    ``ignore`` error strategy.
    """
    catalog_loader = MetadataLoader(CATALOG_PATH)

    dataset_config = default_config()
    dataset_config.error_strategy = dataloader.ErrorStrategy.ignore
    dataset_config.features = [dataloader.Feature.target_ghi]

    def load_station_metadata():
        # Called by the dataset to obtain a metadata iterator.
        return catalog_loader.load(STATION, COORDINATES, skip_missing=False)

    self.dataset = dataloader.create_dataset(
        load_station_metadata,
        config=dataset_config,
    )
def test_load_metadata_image_offset_with_16bit_compression(self):
    """Check the first image offset of the first non-night 16-bit metadata."""
    metadata = MetadataLoader(CATALOG_PATH).load(
        A_STATION,
        A_STATION_COORDINATE,
        compression="16bit",
        night_time=False,
    )

    # Skip ahead to the first entry that is not flagged as night time.
    first_day_metadata = next(
        candidate for candidate in metadata if not candidate.night_time
    )

    self.assertAlmostEqual(first_day_metadata.image_offsets[0], 22)
def test_load_metadata_target_datetimes(self):
    """Check the first image offset of every requested target datetime."""
    loader = MetadataLoader(CATALOG_PATH)

    target_datetimes = [
        datetime(2010, 6, 19, 22, 15),  # Only test timestamp that have images.
        datetime(2012, 3, 24, 12),
        datetime(2015, 9, 21, 21, 15),
        datetime(2012, 7, 6, 18),
        datetime(2014, 7, 13),
        datetime(2010, 8, 31, 20, 45),
        datetime(2015, 4, 16, 12, 45),
        datetime(2013, 4, 17, 16),
        datetime(2012, 8, 15),
        datetime(2010, 11, 14, 19, 15),
        datetime(2014, 7, 21, 14, 30),
        datetime(2011, 11, 22, 17, 30),
        datetime(2010, 8, 15, 23),
        datetime(2010, 5, 11, 19),
        datetime(2013, 2, 15, 14, 15),
        datetime(2011, 2, 8, 17, 45),
    ]
    # Expected first image offset for each datetime above, in the same order.
    target_offsets = [57, 16, 53, 40, 64, 51, 19, 32, 64, 45, 26, 38, 60, 44, 25, 39]

    metadata = loader.load(
        A_STATION,
        A_STATION_COORDINATE,
        night_time=True,
        target_datetimes=target_datetimes,
    )

    loaded_count = 0
    for loaded_count, datapoint in enumerate(metadata, start=1):
        self.assertIsInstance(datapoint.image_offsets[0], int)
        self.assertEqual(
            datapoint.image_offsets[0],
            target_offsets[loaded_count - 1],
        )

    # Every requested datetime must have produced exactly one datapoint.
    self.assertEqual(len(target_datetimes), loaded_count)
def test_givenNumImagesAndTimeInterval_whenLoad_shouldReturnCorrectClearskyValues(
    self,
):
    """Check the dimensions of ``clearsky_values`` on the first metadata."""
    num_images = 5
    num_clearsky = 4

    metadata = MetadataLoader(CATALOG_PATH).load(
        Station.BND,
        A_STATION_COORDINATE,
        num_images=num_images,
    )

    # Inspect the first ``num_images`` metadata entries.
    for _ in range(num_images):
        clearsky_values = next(metadata).clearsky_values
        self.assertEqual(num_images, len(clearsky_values))
        self.assertEqual(num_clearsky, len(clearsky_values[0]))
def test_load_metadata_compression(self):
    """Check that metadata report the compression requested in ``load``."""
    loader = MetadataLoader(CATALOG_PATH)

    # Same check for each compression, 8-bit first and then 16-bit.
    for compression in ("8bit", "16bit"):
        metadata = loader.load(
            A_STATION,
            A_STATION_COORDINATE,
            night_time=True,
            compression=compression,
        )
        reported: Any = next(metadata).image_compression
        self.assertEqual(reported, compression)
def test_givenNumImagesAndTimeInterval_whenLoad_shouldReturnCorrectOffsets(self):
    """Check image offsets of the first metadata entries for a 5-image window."""
    num_images = 5
    time_interval_min = 15

    metadata = MetadataLoader(CATALOG_PATH).load(
        Station.BND,
        A_STATION_COORDINATE,
        num_images=num_images,
        time_interval_min=time_interval_min,
    )

    for position in range(1, num_images + 1):
        # Expected shape: leading zeros, then offsets 0..position-1.
        leading_zeros = [0] * (num_images - position)
        expected_offset = leading_zeros + list(range(position))

        loaded = next(metadata)
        self.assertEqual(expected_offset, loaded.image_offsets)
class MetadataPerf:
    """Iterate over all metadata of one station, printing progress."""

    def __init__(self):
        """Create the metadata loader from ``CATALOG_PATH``."""
        self.loader = MetadataLoader(CATALOG_PATH)

    def run(self):
        """Consume the metadata iterator, printing a line every 100 entries."""
        metadata = self.loader.load(STATION, COORDINATES, skip_missing=False)

        for count, _ in enumerate(metadata):
            if count % 100 != 0:
                continue
            print(f"Loaded {count} metadata")
def test_givenNumImagesAndTimeInterval_whenLoad_shouldReturnCorrectPaths(self):
    """Check image paths of the first metadata entries for a 5-image window."""
    num_images = 5
    first_day_image_path = (
        "/project/cq-training-1/project1/data/hdf5v7_8bit/2010.01.01.0800.h5"
    )

    metadata = MetadataLoader(CATALOG_PATH).load(
        Station.BND,
        A_STATION_COORDINATE,
        num_images=num_images,
    )

    for position in range(1, num_images + 1):
        # Expected shape: placeholder paths first, then the first-day path
        # repeated ``position`` times.
        placeholders = ["/unknow/path"] * (num_images - position)
        expected_path = placeholders + [first_day_image_path] * position

        loaded = next(metadata)
        self.assertEqual(expected_path, loaded.image_paths)
def test_load_metadata(self):
    """Check that ``train.metadata_station`` yields ``Metadata`` instances."""
    metadata_loader = MetadataLoader(file_name=CATALOG_PATH)

    timestamps = metadata_loader.catalog.index.tolist()
    datetimes = [timestamp.to_pydatetime() for timestamp in timestamps]

    metadata = train.metadata_station(
        metadata_loader, datetimes, 1, IMAGE_INTERVAL_MIN
    )

    # Fetch the first element explicitly: if nothing is yielded, ``first`` is
    # None and the assertion fails, instead of the test passing vacuously
    # because a loop body never ran.
    first = next(iter(metadata()), None)
    self.assertIsInstance(first, Metadata)
def load_data(
    file_name=None,
    batch_size=64,
    night_time=False,
    skip_missing=True,
    config=None,
    skip_non_cached=False,
) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    """Load train, valid and test datasets.

    Args:
        file_name: catalog path; defaults to ``env.get_catalog_path()``.
        batch_size: not referenced by this function's body.
        night_time: forwarded to ``metadata_station``.
        skip_missing: forwarded to ``metadata_station``.
        config: dataloader configuration. It is mutated in place. When
            omitted, a fresh ``default_config()`` is created for this call.
        skip_non_cached: copied to ``config.force_caching``.

    Return:
        (train_dataset, valid_dataset, test_dataset)
    """
    # Build the default per call. A default evaluated in the signature would
    # be one shared object, and the mutations below would leak into every
    # later call that relies on the default.
    if config is None:
        config = default_config()

    if file_name is None:
        file_name = env.get_catalog_path()

    if env.run_local:
        config.local_path = env.get_local_data_path() + "/hdf5v7_8bit"

    # Both concepts are equivalent. If we force caching, we need to skip non cached images.
    config.force_caching = skip_non_cached

    train_datetimes, valid_datetimes, test_datetimes = split.load()

    random.shuffle(train_datetimes)
    random.shuffle(valid_datetimes)
    random.shuffle(test_datetimes)

    # Number of datetimes kept from each (shuffled) split.
    ratio_train_datetimes = int(len(train_datetimes) * config.ratio)
    ratio_valid_datetimes = int(len(valid_datetimes) * config.ratio)
    ratio_test_datetimes = int(len(test_datetimes) * config.ratio)

    logger.info(f"Loading {config.ratio*100}% of the data")
    logger.info(f"Training dataset has {ratio_train_datetimes} datetimes")
    logger.info(f"Validation dataset has {ratio_valid_datetimes} datetimes")
    logger.info(f"Test dataset has {ratio_test_datetimes} datetimes")
    logger.info(f"Using {len(STATION_COORDINATES)} stations")

    train_datetimes = train_datetimes[:ratio_train_datetimes]
    valid_datetimes = valid_datetimes[:ratio_valid_datetimes]
    test_datetimes = test_datetimes[:ratio_test_datetimes]

    if dataloader.Feature.metadata in config.features:
        config.precompute_clearsky = True
        target_datetimes = train_datetimes + valid_datetimes + test_datetimes
        config.target_datetimes = target_datetimes
        config.stations = STATION_COORDINATES

    metadata_loader = MetadataLoader(file_name=file_name)

    # Same construction for the three splits, in train/valid/test order.
    datetimes_per_split = (train_datetimes, valid_datetimes, test_datetimes)
    metadata_per_split = [
        metadata_station(
            metadata_loader,
            split_datetimes,
            config.num_images,
            config.time_interval_min,
            night_time=night_time,
            skip_missing=skip_missing,
        )
        for split_datetimes in datetimes_per_split
    ]

    dataset_train, dataset_valid, dataset_test = [
        dataloader.create_dataset(
            split_metadata, config, split_datetimes, STATION_COORDINATES
        )
        for split_metadata, split_datetimes in zip(
            metadata_per_split, datetimes_per_split
        )
    ]

    logger.info("Loaded datasets.")

    return dataset_train, dataset_valid, dataset_test
def __init__(self):
    """Create the metadata loader from ``CATALOG_PATH`` and keep it on ``self``."""
    catalog_loader = MetadataLoader(CATALOG_PATH)
    self.loader = catalog_loader