def test_complicated_subdirs(self):
    """Check whether FileSet can find files in subdirectories that contain
    text and placeholders.
    """
    # The Pinocchio fileset from the cloud toolbox: a folder name contains
    # normal text and a placeholder:
    pinocchio = FileSet(
        join(
            self.refdir, "pinocchio",
            "t{year2}{month}{day}",
            "tm{year2}{month}{day}{hour}{minute}{second}{millisecond}.jpg",
        ),
    )

    # Find all files:
    files = list(pinocchio)

    check = [
        FileInfo(
            join(self.refdir, 'pinocchio', 't171102', 'tm171102132855573.jpg'),
            [
                datetime.datetime(2017, 11, 2, 13, 28, 55, 573000),
                datetime.datetime(2017, 11, 2, 13, 28, 55, 573000)
            ],
            {}),
    ]

    assert files == check
def main():
    args = get_cmd_line_parser().parse_args()

    # TODO: Delete this line if you fixed this script
    print("Please check the code before running this.")
    # exit()

    images_path = join(
        args.root_dir,
        "{temperature}/m{year2}{month}{day}{hour}{minute}{second}*.jpg")
    images = FileSet(
        path=images_path,
        handler=cloud.pinocchio.ThermalCam(to_temperatures=False),
        name="Calibration Images",
    )

    # Create the calibration mask. Only a small part of the image shows the
    # correct pixel values for the corresponding temperature; the rest does
    # not see the calibration target.
    calibration_mask = np.zeros((252, 336))
    calibration_mask[115:130, 160:180] = 1
    calibration_mask = calibration_mask.astype("bool")

    # Get the temperatures (normally from a file, but this is not implemented
    # yet):
    temperature = get_temperatures()

    create_calibration_file(
        images, temperature, calibration_mask,
        'pinocchio_calibration_%Y%m%d.csv',
    )
def test_search(self):
    """Collocate fake MHS filesets"""
    fake_mhs1 = FileSet(
        path=join(
            self.refdir, "{satname}_mhs_{year}", "{month}", "{day}",
            "*NSS.MHSX.*.S{hour}{minute}.E{end_hour}{end_minute}.*.h5"),
        handler=MHS_HDF(),
    )
    fake_mhs2 = fake_mhs1.copy()

    with TemporaryDirectory() as outdir:
        collocations = Collocations(
            path=join(
                outdir, "{year}-{month}-{day}",
                "{hour}{minute}{second}-{end_hour}{end_minute}{end_second}"),
        )
        collocations.search(
            [fake_mhs1, fake_mhs2], start="2007", end="2008",
            max_interval="1h", max_distance="10km",
        )
def collect_spareice(version):
    spareice_files = FileSet(
        name="SPAREICE",
        path=f"/work/um0878/user_data/jmrziglod/spareice/{version}/noaa18/"
             "{year}/{month}/{day}/{year}{month}{day}_{hour}{minute}{second}-"
             "{end_hour}{end_minute}{end_second}.nc",
        max_processes=PROCESSES,
        placeholder={"version": version},
    )

    print("Collect SPARE-ICE...")
    data_list = spareice_files.map(
        get_gridded_mean, start=START, end=END, on_content=True,
        pass_info=True,
    )
    data = xr.concat(data_list, dim="time")
    # data.to_netcdf(f"data/{version}_SPARE-ICE_{START}.nc")
    return data
def collect_cloudsat():
    cloudsat_files = FileSet(
        name="2C-ICE",
        path="/work/um0878/data/cloudsat/2C-ICE.P1_R04/{year}/{doy}/"
             "{year}{doy}{hour}{minute}{second}_*.hdf.zip",
        handler=CloudSat(),
        # Each file of CloudSat covers exactly 5933 seconds. Since we state it
        # here, the indexing of files is much faster:
        time_coverage="5933 seconds",
        # Load only the fields that we need:
        read_args={
            "fields": ["ice_water_path"],
        },
        max_threads=15,
    )

    print("Collect 2C-ICE...")
    data = xr.concat(cloudsat_files[START:END], dim="scnline")
    data.to_netcdf(f"data/2C-ICE_{START}.nc")
    return data
    not_null.lat, range=[[0, 90], [-90, 90]], bisectrix=False,
    cmap="density", vmin=1,
)
scat.cmap.set_under("w")
plt.colorbar(scat)
plt.savefig(f"experiments/{experiment}/scnpos_lat_heatmap.png")

test_ratio = 0.3
train_data, test_data = train_test_split(
    bdata, test_size=test_ratio, shuffle=True, random_state=5
)
print(f"Use {int(not_null.lat.size*(1-test_ratio))} points for training")
print(f"Use {int(not_null.lat.size*test_ratio)} points for testing")

experiments = FileSet("experiments/{experiment}/spareice.json")

if plot_all:
    spareice = SPAREICE(
        verbose=2, processes=processes,
        sea_mask_file="data/land_water_mask_5min.png",
        elevation_file="data/surface_elevation_1deg.nc",
    )
    for parameters in experiments:
        try:
            print(f"plot experiment {parameters.attr['experiment']}")
            spareice.load(parameters)
            spareice.report(
                "experiments", parameters.attr["experiment"], test_data
            )
        except:
def __init__(
        self, images, labels=None, augmentator=None, reader=None,
        batch_size=None, balance=False, label_encoding='one-hot',
        yield_mode='both', shuffle=True, random_seed=42, max_workers=None,
        classes=None, preprocess_input=None, target_size=None,
):
    """Create an ImageLoader

    Args:
        images: Must be either an iterable of image filenames, a glob path
            (e.g. /path/to/images/*.tif) or a path containing the
            placeholder *{label}* (e.g. /path/to/{label}/*.tif to match
            /path/to/car/001.tif). In the latter case, you do not have to
            set the parameter *labels*.
        labels: This must be given or *images* must contain a placeholder
            with *{label}* if you want to balance this dataset. Must be an
            iterable of labels with the same length as *images*.
        reader: Function to read the images. If None, images will be read
            by the opencv imread function. Default: None.
        shuffle: Shuffle the dataset once before yielding. Default: True.
        random_seed: Number to initialize a random state. Default: 42.
        augmentator: Use your favourite augmentator object. Actively
            supported are keras, imgaug and Albumentations image
            augmentators. Can also be set to a function that will be called
            on each image before yielding it to the model. Default: None.
        classes: Classes which will be encoded in this dataset.
        batch_size: Size of one batch. Default: 32.
        balance: Can be either:

            * *True*: the minority classes are going to be oversampled so
              they have the same number of samples as the majority class.
              If this is used, *labels* must be given.
            * *iterable*: An iterable with the weights for each sample.
              The sum of all weights should be 1.

            Default: False.
        balance_batch: If *True*, all classes appear with equal numbers in
            each batch. Works only if the number of classes is equal to or
            lower than the batch size. Default: False.
        label_encoding: Can be either:

            * *False*: No encoding.
            * *one-hot*: 1D numpy array of binary labels.
            * *binary*: Use when you have only two classes. One will be
              labelled with 0, the other one with 1.
            ...

            Default: *one-hot*.
        yield_mode: Defines what the ImageLoader will yield for each batch:

            * *both*: Yields inputs and labels (required for training
              models).
            * *inputs*: Yields only the inputs.
            * *labels*: Yields only the labels.

            Default: *both*.
        target_size: Set target size of images as a tuple of
            (height, width) in pixels. Default: None.

    Examples:

        from ai4eo.preprocessing import ImageLoader
        from keras.preprocessing.image import ImageDataGenerator

        keras_augmentator = ImageDataGenerator(
            featurewise_center=True,
            featurewise_std_normalization=True,
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True
        )

        data = ImageLoader(
            '/path/to/images/{label}/*.tif',
            augmentator=keras_augmentator,
        )

        # Create keras model
        model = ...

        model.fit_generator(data, ...)
""" if isinstance(images, str): # Let's try to find all images in the given path files = FileSet(images).to_dataframe() images = files.index.values if "label" in files.columns: labels = files['label'].values if labels is not None and len(labels) != len(images): raise ValueError("images and labels must have the same length!") self.images = np.array(images) self.labels = None if labels is None else np.array(labels) self.classes = classes if self.classes is None and self.labels is not None: self.classes = np.unique(self.labels) if self.classes is not None: self.class_indices = { index: label for index, label in enumerate(self.classes) } else: self.class_indices = None if self.labels is not None: if label_encoding == 'one-hot' or label_encoding == 'binary': self.labels = label_binarize(self.labels, classes=self.classes) if label_encoding == 'binary': self.labels = np.squeeze(self.labels) self.yield_mode = yield_mode self.reader = reader self.augmentator = augmentator self.augmentator_type = None if callable(getattr(self.augmentator, "random_transform", None)): self.augmentator_type = 'keras' elif callable(getattr(self.augmentator, "augment_batches", None)): self.augmentator_type = 'imgaug' # elif callable(getattr(self.augmentator, "augment_batches", None)): # self.augmentator_type = 'imgaug' self.batch_size = batch_size or 32 self.preprocess_input = preprocess_input self.target_size = target_size # To make the experiments reproducible: self.random_state = np.random.RandomState(random_seed) self.random_seed = random_seed self.max_workers = max_workers self._indices = list(range(len(self.images))) if shuffle: self.random_state.shuffle(self._indices) if not balance: self._weights = None # Check explicitly for True because iterables could also return True in # a boolean context elif balance is True: # We want to oversample the minority classes, i.e. the set the # weights accordingly (the lower the amount of samples per class, # the higher the weight for them). if self.labels is None: raise ValueError('Cannot balance samples by myself without' 'having any labels! Please set *labels*!') unique_labels, counts = np.unique(labels, return_counts=True) label_counts = pd.Series(counts, index=unique_labels) self._weights = \ 1 / label_counts.loc[labels].values / len(label_counts) else: self._weights = balance self.reset()
def init_filesets(self):
    if self.filesets is not None:
        return self.filesets

    self.filesets = FileSetManager()

    self.filesets += FileSet(
        join(
            self.refdir, "tutorial", "{satellite}", "{year}-{month}-{day}",
            "{hour}{minute}{second}-{end_hour}{end_minute}{end_second}.nc"
        ),
        name="tutorial",
    )

    self.filesets += FileSet(
        join(self.refdir, "single_file.nc"),
        name="single",
        time_coverage=["2018-01-01", "2018-01-03"],
    )

    def sequence_get_info(file_info, **kwargs):
        """Small helper function for the sequence filesets."""
        with open(file_info) as f:
            file_info.times[0] = datetime.datetime.strptime(
                f.readline().rstrip(), "Start: %Y-%m-%d %H:%M:%S")
            file_info.times[1] = datetime.datetime.strptime(
                f.readline().rstrip(), "End: %Y-%m-%d %H:%M:%S")
        return file_info

    self.filesets += FileSet(
        join(self.refdir, "sequence", "{year}", "{doy}", "sequence*.txt"),
        name="sequence-wildcard",
        handler=FileHandler(info=sequence_get_info),
        info_via="handler",
    )

    self.filesets += FileSet(
        join(self.refdir, "sequence", "{year}", "{doy}", "sequence{id}.txt"),
        handler=FileHandler(info=sequence_get_info),
        name="sequence-placeholder",
        info_via="both",
        placeholder={"id": r"\d{4}"},
    )

    self.filesets += FileSet(
        join(
            self.refdir,
            # NSS.HIRX.NJ.D99127.S0632.E0820.B2241718.WI.gz
            "regex",
            "NSS.HIR[XS].{satcode}.D{year2}{doy}.S{hour}"
            "{minute}.E{end_hour}{end_minute}.B{B}.{station}.gz"),
        name="regex-HIRS",
    )
    self.filesets["regex-HIRS"].set_placeholders(
        satcode=".{2}", B=r"\d{7}", station=".{2}",
    )

    return self.filesets
def test_glob(self):
    files = FileSet(
        join(self.refdir, "tutorial", "{satellite}", "*", "*.nc"),
        placeholder={"satellite": 'SatelliteA'},
    )
    self._print_files(list(files))

    # Sort by path rather than by time (because the times are all equal)
    check = list(sorted([
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '000000-040000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '080000-120000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '200000-000000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '040000-080000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '120000-160000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                 '160000-200000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '000000-040000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '080000-120000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '200000-000000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '040000-080000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '120000-160000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
        FileInfo(
            join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                 '160000-200000.nc'),
            [datetime.datetime(1, 1, 1, 0, 0),
             datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)],
            {'satellite': 'SatelliteA'}),
    ], key=lambda x: x.path))

    assert list(sorted(files, key=lambda x: x.path)) == check
"""Create fold files for training and test images """ import numpy as np import pandas as pd from typhon.files import FileSet df = pd.concat([ FileSet( '/scratch-a/jmrziglod/sen2agri/data/malawi_summer/patches/original/{label}/*.png' ).to_dataframe(), FileSet( '/scratch-a/jmrziglod/sen2agri/data/malawi_summer/patches/augmented/{label}/*.png' ).to_dataframe() ]) unique_ids = np.unique(df.id.values) shuffled_ids = np.random.choice(unique_ids, size=len(unique_ids), replace=False) ratio = 0.1 test_ids = shuffled_ids[:int(shuffled_ids.size * ratio)] train_ids = shuffled_ids[int(shuffled_ids.size * ratio):] with open( '/home/jmrziglod/projects/sen2agri/drone-crop-type/folds/malawi_summer/train_test_all_mosaics/train.txt', 'w') as txt_file: txt_file.write("\n".join(df.index[df.id.isin(train_ids)].tolist())) with open( '/home/jmrziglod/projects/sen2agri/drone-crop-type/folds/malawi_summer/train_test_all_mosaics/test.txt', 'w') as txt_file:
START_TIME = "2007" END_TIME = "March 2010" PROCESSES = 12 TRAINING_FILE = "spareice_training_data.nc" # Define a fileset with the files from MHS / NOAA18: mhs = FileSet( name="MHS", path="/work/um0878/data/amsub_mhs_l1c_hdf/AAPP7_13/noaa18" "_mhs_{year}/{month}/{day}/*NSS.MHSX.NN.*." "S{hour}{minute}.E{end_hour}{end_minute}.*.h5", handler=MHS_HDF(), # Load only the fields that we need: read_args={ "fields": [ "Data/btemps", "Geolocation/Satellite_azimuth_angle", "Geolocation/Satellite_zenith_angle", "Geolocation/Solar_azimuth_angle", "Geolocation/Solar_zenith_angle", ] }, ) # Define a fileset with files from CloudSat / 2C-ICE: cloudsat = FileSet( name="2C-ICE", path="/work/um0878/data/cloudsat/2C-ICE.P1_R04/{year}/{doy}/" "{year}{doy}{hour}{minute}{second}_*.hdf.zip", handler=CloudSat(),
END_TIME = "21 Jun 2013" PROCESSES = 1 SAVE_COLLOCATIONS = False version = "typhon" # Define a fileset with the files from MHS / NOAA18: mhs_files = FileSet( name="MHS", path="/work/um0878/data/amsub_mhs_l1c_hdf/AAPP7_13/noaa18" "_mhs_{year}/{month}/{day}/*NSS.MHSX.NN.*." "S{hour}{minute}.E{end_hour}{end_minute}.*.h5", handler=MHS_HDF(), # Load only the fields that we need: read_args={ "fields": [ "Data/btemps", "Geolocation/Satellite_azimuth_angle", "Geolocation/Satellite_zenith_angle", "Geolocation/Solar_azimuth_angle", "Geolocation/Solar_zenith_angle", ] }, ) # Define a fileset with the files from AVHRR / NOAA18: avhrr_files = FileSet( name="AVHRR", path="/work/um0878/user_data/jmrziglod/avhrr_gac_hdf5/noaa18_gac_" "{year}/{month}/{day}/NSS.*." "S{hour}{minute}.E{end_hour}{end_minute}.*.h5",
def load_filesets(config):
    """Load all filesets into one FileSetManager object

    Args:
        config: Dictionary with configuration keys and values.

    Returns:
        A FileSetManager object.
    """
    basedir = config["General"]["basedir"]

    # This FileSetManager can handle all dataset objects:
    filesets = FileSetManager()

    ###########################################################################
    # Pinocchio - FileSets:
    filesets += FileSet(
        name="Pinocchio-netcdf",
        path=os.path.join(
            config["General"]["basedir"],
            config["Pinocchio"]["nc_files"],
        ),
        max_processes=int(config["General"]["processes"]),
    )

    filesets += FileSet(
        name="Pinocchio-archive",
        path=os.path.join(basedir, config["Pinocchio"]["archive_files"]),
        max_processes=int(config["General"]["processes"]),
    )

    # Load the logbook from Pinocchio. This logbook contains time intervals
    # where the data is corrupted or bad.
    logbook = None
    if "logbook" in config["Pinocchio"]:
        logbook = load_logbook(
            os.path.join(basedir, config["Pinocchio"]["logbook"]))

    pinocchio_calibration = os.path.join(
        config["General"]["basedir"],
        config["Pinocchio"]["calibration"],
    )
    filesets += FileSet(
        name="Pinocchio-raw",
        path=os.path.join(
            config["General"]["basedir"],
            os.path.splitext(config["Pinocchio"]["archive_files"])[0],
            config["Pinocchio"]["files_in_archive"],
        ),
        # Set the pinocchio file handler with the calibration file
        handler=pinocchio.ThermalCam(calibration_file=pinocchio_calibration),
        max_processes=int(config["General"]["processes"]),
        # Exclude the time intervals from the logbook when searching for files:
        exclude=logbook,
    )

    filesets += FileSet(
        path=os.path.join(basedir, config["Pinocchio"]["stats"]),
        name="Pinocchio-stats",
        max_processes=int(config["General"]["processes"]),
    )
    ###########################################################################

    ###########################################################################
    # Dumbo - FileSets:
    filesets += FileSet(
        name="Dumbo-netcdf",
        path=os.path.join(
            config["General"]["basedir"],
            config["Dumbo"]["nc_files"],
        ),
        max_processes=int(config["General"]["processes"]),
    )

    # Load the logbook from Dumbo:
    logbook = None
    if "logbook" in config["Dumbo"]:
        logbook = load_logbook(
            os.path.join(basedir, config["Dumbo"]["logbook"]))

    filesets += FileSet(
        name="Dumbo-raw",
        path=os.path.join(
            config["General"]["basedir"],
            config["Dumbo"]["raw_files"],
        ),
        handler=dumbo.ThermalCamASCII(),
        # Since the raw files have no temporal information in their filename,
        # we have to retrieve it via their handler:
        info_via="handler",
        max_processes=int(config["General"]["processes"]),
        # Exclude the time intervals from the logbook when searching for files:
        exclude=logbook,
    )

    filesets += FileSet(
        path=os.path.join(basedir, config["Dumbo"]["stats"]),
        name="Dumbo-stats",
        max_processes=int(config["General"]["processes"]),
    )
    ###########################################################################

    filesets += FileSet(
        path=os.path.join(basedir, config["Ceilometer"]["files"]),
        name="Ceilometer",
        # Each file covers roughly 24 hours:
        time_coverage="24 hours",
        max_processes=int(config["General"]["processes"]),
    )

    filesets += FileSet(
        path=os.path.join(basedir, config["DShip"]["files"]),
        handler=metadata.ShipMSM(),
        name="DShip",
        max_processes=int(config["General"]["processes"]),
    )

    filesets += FileSet(
        path=os.path.join(basedir, config["Plots"]["files"]),
        name="plots",
        handler=Plotter(),
        max_processes=int(config["General"]["processes"]),
    )

    return filesets
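# A minimal sketch of the config structure that load_filesets() expects,
# derived from the keys accessed above. The "<...>" values are placeholders,
# not real paths, and the "logbook" entries are optional.
example_config = {
    "General": {"basedir": "<basedir>", "processes": "2"},
    "Pinocchio": {
        "nc_files": "<path pattern>",
        "archive_files": "<path pattern>",
        "files_in_archive": "<path pattern>",
        "calibration": "<calibration file>",
        "stats": "<path pattern>",
        # "logbook": "<logbook file>",
    },
    "Dumbo": {
        "nc_files": "<path pattern>",
        "raw_files": "<path pattern>",
        "stats": "<path pattern>",
        # "logbook": "<logbook file>",
    },
    "Ceilometer": {"files": "<path pattern>"},
    "DShip": {"files": "<path pattern>"},
    "Plots": {"files": "<path pattern>"},
}
# filesets = load_filesets(example_config)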