def cleanup_missing_data(self, sensor):
    """ This function ensures that incomplete scenes are removed: for a given date,
    if any input data layer is missing, all other layers are also removed for this date. """
    # get all available dates
    alldates = []
    for path in self['input'][sensor]['_names']:
        alldates += get_param_in_tree(self, ['input', sensor] + path, '_scenes_dates')
    # make it a set (unique values, and allows computing the difference with "-")
    alldates = set(alldates)

    # find the dates with missing data
    toremove = []
    for path in self['input'][sensor]['_names']:
        data_type = get_param_in_tree(self, ['input', sensor] + path,
                                      'scenes_dates_indexing_method')
        logging.debug(f'data type in cleanup_missing_data {data_type}')
        if data_type == 'dynamic':
            dates_for_this_path = get_param_in_tree(
                self, ['input', sensor] + path, '_scenes_dates')
            missing = alldates - set(dates_for_this_path)
            logging.debug(
                f'find missing : {path} : {len(dates_for_this_path)} : {len(missing)}')
            if missing:
                logging.error(
                    f'{sensor} : Removing dates {missing} because {path} data is missing')
                toremove += missing

    fulldates = sorted(list(alldates - set(toremove)))
    for path in self['input'][sensor]['_names']:
        data_type = get_param_in_tree(self, ['input', sensor] + path,
                                      'scenes_dates_indexing_method')
        if data_type == 'dynamic':
            set_param_in_tree(self, ['input', sensor] + path, '_scenes_dates',
                              value=fulldates)
            logging.debug(
                f"{len(fulldates)} dates will actually be used for {sensor} {path}")
        else:
            dateslist = get_param_in_tree(self, ['input', sensor] + path,
                                          '_scenes_dates')
            dateslist = dateslist * len(fulldates)
            set_param_in_tree(self, ['input', sensor] + path, '_scenes_dates',
                              value=dateslist)
            logging.debug(
                f"{len(dateslist)} dates will actually be used for {sensor} {path}")
            logging.debug(
                f"Static dates were artificially replicated for {sensor} {path}")
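# A minimal sketch of the set arithmetic used above (illustrative only; assumes scene
# dates are datetime objects, as elsewhere in this module):
#
#   >>> from datetime import datetime
#   >>> alldates = {datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)}
#   >>> dates_for_this_path = [datetime(2020, 1, 1), datetime(2020, 1, 3)]
#   >>> missing = alldates - set(dates_for_this_path)   # -> {datetime(2020, 1, 2)}
#   >>> sorted(alldates - missing)                       # -> the kept "fulldates"
#   [datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 1, 3, 0, 0)]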
def get_data_per_band(self, sensor, internalkey):
    """ This function uses the config dictionary to locate the appropriate code
    that must be used to read the data. The piece of code to run is located in the
    folder 'readers'. The function 'get_data_reader()' (which is in readers/__init__.py)
    is responsible for transforming the string "readername" into data_reader_class
    (code that can read the data) and data_params (the list of parameters required by
    data_reader_class). The actual data is then read, and the return value is an object
    containing the data (along with some logging information).
    This function is used when each band needs to be read separately; it then
    aggregates the bands together. """
    # there are multiple values for this layer, one for each band.
    # Find the paths to the config for each band
    band_keys = get_param_in_tree(self, ['input', sensor, internalkey], 'band_keys')
    paths = [['input', sensor, internalkey, band_key] for band_key in band_keys]

    data_objects = []
    for path in paths:
        # get the reader from the config
        readername = get_param_in_tree(self, path, 'data_reader_name')
        data_reader_class, data_params = get_data_reader(readername)
        # get the actual values of the parameters required by this reader
        data_params_dict = {
            p: get_param_in_tree(self, path, p) for p in data_params
        }
        data_object = data_reader_class(name=internalkey)
        # read the data
        logging.debug(
            f' using {readername} {path}, and parameters {data_params_dict.keys()}')
        data_object = data_object.load(
            **{
                'scenes_dates': get_param_in_tree(self, path, '_scenes_dates'),
                'xslice': get_param_in_tree(self, [], 'xslice'),
                'yslice': get_param_in_tree(self, [], 'yslice'),
                'dataloc': get_param_in_tree(self, path, '_dataloc'),
                **data_params_dict
            })
        data_objects.append(data_object)

    # now the list "data_objects" contains one matrix per band; merge them into a single matrix.
    # A drawback of moving the data around like this is that it is slightly slower than loading
    # directly into the final matrix, but this is not a real problem considering that the moves
    # performed in memory are very fast compared to reading from disk.
    # The main advantage is to simplify the I/O code (and to easily allow a different
    # configuration for each band if needed).
    data_object = stack_it(data_objects, sensorname=sensor)
    logging.info(
        f'Data loaded {internalkey} : {data_object.values.shape} matrix for sensor {sensor}')
    return data_object
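# For reference, a reader resolved by get_data_reader() is used here as if it followed the
# minimal interface below. This is a sketch inferred from the call sites above, not the
# actual classes in readers/; 'SomeReader', 'param_a' and 'param_b' are hypothetical names.
#
#   class SomeReader:
#       def __init__(self, name):
#           self.name = name
#       def load(self, scenes_dates, xslice, yslice, dataloc, **reader_specific_params):
#           ...                 # fill self.values with the data matrix for the requested slice
#           return self
#
#   get_data_reader('some_reader_name') would then return (SomeReader, ['param_a', 'param_b']),
#   where 'param_a'/'param_b' are the reader-specific keys looked up in the config tree.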
def get_data(self, internalkey, sensor):
    # loop through all sensors, even if multi-sensor is not implemented:
    # dbox['input']['sensors'] should have only one element.
    logging.debug(f'Reading {internalkey} data for sensor {sensor}')
    indexing_method = get_param_in_tree(self, ['input', sensor, internalkey],
                                        'band_indexing_method')
    if indexing_method == 'full':
        return self.get_data_per_band(sensor, internalkey)
    elif indexing_method == 'sparse':
        return self.get_data_sparse(sensor, internalkey)
    elif indexing_method == 'constant':
        return self.get_data_all_bands(sensor, internalkey)
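# The dispatch above maps each layer's 'band_indexing_method' config value to a loading
# strategy: 'full' reads one entry per measurement band, 'sparse' reads the bands listed in
# 'band_indexing_list', and 'constant' reads the layer once for all bands. A hypothetical
# config fragment could look like this (which layer uses which method depends on the actual
# configuration, not on this module):
#
#   input:
#     <sensor>:
#       toc_reflectance: { band_indexing_method: full }      # one sub-entry per bandN
#       lwcs_mask:       { band_indexing_method: sparse }    # explicit band_indexing_list
#       latitude:        { band_indexing_method: constant }  # single layer, read once
#
# Note that an unrecognised value falls through the if/elif chain and returns None.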
def get_checkpoint_data(self, dbox, date, sensor):
    """ Read the previous BRDF parameter estimates for the Kalman filter. """
    inputcheckpoint = dbox['inputcheckpoint']
    logging.debug(f'Getting checkpoint from {inputcheckpoint}')
    xslice = dbox['xslice']
    yslice = dbox['yslice']
    model_len = dbox['model_len']
    n_channels_ref = dbox['n_channels_ref']

    if inputcheckpoint and not dbox[f'{sensor}']['startseries']:
        # an inputcheckpoint is available, or this is a later calculation after spin-off
        self.current_startseries = False
        data_reader_class, data_params = get_data_reader(inputcheckpoint['reader'])
        data_params_dict = {
            p: get_param_in_tree(inputcheckpoint, [], p) for p in data_params
        }
        reader = data_reader_class()
        reader.load_brdf(**data_params_dict,
                         n_channels_ref=n_channels_ref,
                         model_len=model_len,
                         xslice=xslice,
                         yslice=yslice)
    else:
        # no checkpoint: create an empty initial state
        logging.debug('Setting up empty initial brdf')
        print('Setting up empty initial brdf, often due to error')
        self.current_startseries = True
        reader = EmptyBrdf().load_brdf(xslice, yslice, n_channels_ref, model_len)

    self.quality_in = reader.quality
    self.age_obs_in = reader.age_obs
    self.brdf_in = reader.brdf
    self.covariance_in = reader.covariance
    self.days_last_in = (date - reader.previous_date).days
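# days_last_in is simply the whole-day gap between the current date and the checkpoint
# date, e.g. (illustrative, standard library only):
#
#   >>> from datetime import datetime
#   >>> (datetime(2020, 1, 10) - datetime(2020, 1, 5)).days
#   5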
def get_data_all_bands(self, sensor, internalkey):
    """ This function uses the config dictionary to locate the appropriate code
    that must be used to read the data. The piece of code to run is located in the
    folder 'readers'. The function 'get_data_reader()' (which is in readers/__init__.py)
    is responsible for transforming the string "readername" into data_reader_class
    (code that can read the data) and data_params (the list of parameters required by
    data_reader_class). The actual data is then read, and the return value is an object
    containing the data and some logging information.
    This function is used when all bands need to be read together. """
    path = ['input', sensor, internalkey]
    # get the reader from the config
    readername = get_param_in_tree(self, path, 'data_reader_name')
    data_reader_class, data_params = get_data_reader(readername)
    # get the actual values of the parameters required by this reader
    data_params_dict = {
        p: get_param_in_tree(self, path, p) for p in data_params
    }
    data_object = data_reader_class(name=internalkey)
    # read the data
    logging.debug(
        f' using {readername} {path}, and parameters {data_params_dict.keys()}')
    data_object = data_object.load(
        **{
            'scenes_dates': get_param_in_tree(self, path, '_scenes_dates'),
            'xslice': get_param_in_tree(self, [], 'xslice'),
            'yslice': get_param_in_tree(self, [], 'yslice'),
            'dataloc': get_param_in_tree(self, path, '_dataloc'),
            **data_params_dict
        })
    logging.info(
        f'Data loaded {internalkey} : {data_object.values.shape} matrix for sensor {sensor}')
    return data_object
def filter_scenes_dates(self, dstore, sensor, outputdate):
    """ This function filters the scene dates available from the dstore.
    It keeps only the dates that are relevant for the current time step and
    stores them in the newly created DataBox object. """
    # create a deep copy of the dstore dict because we will delete the information
    # about files that are irrelevant for the current date.
    self.update(deepcopy(dstore))

    for path in self['input'][sensor]['_names']:  # "path" loops through all input data layers
        if (path[-1] == 'brdf_clim') or (path[-1] == 'brdf_clim_cov'):
            # Exclude the BRDF climatology layers from the criteria used to select the
            # available dates for the albedo calculation. The consequence could be that
            # no BRDF climatology is found, but we continue processing anyway.
            continue
        data_type = get_param_in_tree(self, ['input', sensor] + path,
                                      'scenes_dates_indexing_method')
        logging.debug(f' in data_box filter_scenes_dates we now treat {path}')

        # get the data location: filenames for this input layer
        dataloc = get_param_in_tree(self, ['input', sensor] + path, '_dataloc')
        # get ALL the dates that are available for this layer
        availabledates = list(dataloc.keys())

        # using the time_span from the config, select only
        # the relevant dates in relevant_scene_dates
        time_span_for_composition_in_days = self['time_span_for_composition']
        relevant_scene_dates = compute_relevant_scenes_dates(
            availabledates, outputdate, time_span_for_composition_in_days,
            [sensor, path])
        logging.info(
            f'{sensor}/{path}: {len(relevant_scene_dates)} dates ({len(availabledates)} available)')

        # remove the dates that are not relevant
        if data_type == 'dynamic':
            for toremove in (set(availabledates) - set(relevant_scene_dates)):
                del dataloc[toremove]
        elif data_type == 'static':
            relevant_scene_dates = availabledates
        logging.debug(
            f' Length of the relevant_scene_dates is now {len(relevant_scene_dates)} ')

        # and save the list of dates in the DataBox object
        # to keep the information about their order (even if we can always reorder them)
        set_param_in_tree(self, ['input', sensor] + path, '_scenes_dates',
                          value=relevant_scene_dates)
        set_param_in_tree(
            self, ['input', sensor] + path, '_scenes_dates.comment',
            value=f'This list contains all the relevant scene dates to compute output for date ({outputdate})')
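# Conceptually, the date selection amounts to a time-window filter of roughly the shape
# below. This is only a sketch: the real compute_relevant_scenes_dates() may bound or
# centre the window differently, and 'window_filter' is a hypothetical name.
#
#   >>> from datetime import datetime, timedelta
#   >>> def window_filter(available, outputdate, span_days):
#   ...     span = timedelta(days=span_days)
#   ...     return [d for d in available if outputdate - span <= d <= outputdate]
#   >>> window_filter([datetime(2020, 1, d) for d in (1, 10, 20, 31)],
#   ...               datetime(2020, 1, 31), 15)
#   [datetime.datetime(2020, 1, 20, 0, 0), datetime.datetime(2020, 1, 31, 0, 0)]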
def setup_input_one_sensor(self, inputconfig, sensor):
    """ Quick parse of all the available data; creates a dict describing how to find
    the data (with its date) for each variable and each file.
    This function will create dictionaries following the scheme:

    :param inputconfig: input configuration for one sensor
    :type inputconfig: dict
    :rtype: dict of dict of dict

    date -> { sensor -> scene -> band -> reflectance, etc.} -> angle -> lat/lon}

    See the output file in .yaml format to understand/check the structure of the
    nested dictionary. """
    # make a copy of the config dict because we are going to populate it
    # and we don't want to change the original input
    self['input'][sensor] = deepcopy(inputconfig)

    # The number of bands is inferred from the input files:
    # it is the number of input layers starting with "band" followed by a number
    band_keys = [k for k in inputconfig['toc_reflectance'].keys()
                 if re.match('^band[0-9]*$', k)]
    self['input'][sensor]['n_channels_meas'] = len(band_keys)
    self['input'][sensor]['band_keys'] = band_keys
    logging.info(f'Sensor {sensor} : {band_keys}')

    # the list of input names will be used in data_manager_one_step.
    self['input'][sensor]['_names'] = []
    list_param = ["toc_reflectance", "toc_reflectance_cov", "lwcs_mask",
                  "azimuth_sol", "azimuth_sat", "zenith_sol", "zenith_sat",
                  "latitude", "longitude"]
    if self.acf['nam_inv']['brdf_clim_activated'] is True:
        list_param.append('brdf_clim')
        list_param.append('brdf_clim_cov')

    for k in list_param:
        per_band = get_param_in_tree(self, ['input', sensor, k], 'band_indexing_method')
        if per_band == 'full':
            self['input'][sensor]['_names'] += [list(i) for i in itertools.product([k], band_keys)]
        elif per_band == 'sparse':
            # use a dedicated variable here so that the sensor band_keys computed above
            # are not overwritten for the following layers
            sparse_band_keys = get_param_in_tree(self, ['input', sensor, k], 'band_indexing_list')
            self['input'][sensor]['_names'] += [list(i) for i in itertools.product([k], sparse_band_keys)]
        elif per_band == 'constant':
            self['input'][sensor]['_names'] += [[k]]
    self['input'][sensor]['_names'] = tuple(self['input'][sensor]['_names'])
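    # A minimal sketch of how the band keys and the '_names' paths are expanded above
    # (the layer and band names here are hypothetical):
    #
    #   >>> import re, itertools
    #   >>> layers = {'band1': {}, 'band2': {}, 'quality_flag': {}}
    #   >>> band_keys = [k for k in layers if re.match('^band[0-9]*$', k)]   # ['band1', 'band2']
    #   >>> [list(i) for i in itertools.product(['toc_reflectance'], band_keys)]
    #   [['toc_reflectance', 'band1'], ['toc_reflectance', 'band2']]
    #
    # A 'constant' layer (e.g. 'latitude', if configured that way) contributes a single
    # path such as ['latitude'] instead of one path per band.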
    # initialize an empty cache.
    cache = {}
    for sensorpath in self['input'][sensor]['_names']:  # for each input data path
        path = [*sensorpath]
        # get the code and the list of parameter names that are needed to read the data files
        logging.debug(f'Reading metadata {path}')
        #if sensorpath == 'brdf_clim': import ipdb; ipdb.set_trace()
        readername = get_param_in_tree(inputconfig, path, 'dataloc_reader_name')
        logging.info(f'Reading metadata for path {path}, using {readername}')
        dataloc_reader_function, required_params, datetime_params = get_dataloc_reader(readername)
        logging.debug(f'The dataloc_reader_function ({dataloc_reader_function.__doc__}) needs the parameters {datetime_params}, {required_params}')

        if not inputconfig['use_cache']:
            required_params_dict = {p: get_param_in_tree(inputconfig, path, p) for p in required_params}
            dataloc, metadata = dataloc_reader_function(**{'output_dates': self[sensor]['output_dates'], **required_params_dict})
        else:
            # build the cache key in case the files have already been opened before
            cache_dict = {p: get_param_in_tree(inputconfig, path, p) for p in datetime_params}
            cachekey, filecachekey = get_frozen_keys({'dataloc_reader_name': readername, **cache_dict})
            cachefile = f'cache/dataloc/{filecachekey}'
            logging.debug(f'Using cache {path}: {filecachekey} : {cachekey}')
            if cachekey in cache:
                # the (unique) cachekey has been found, reuse the value in cache
                dataloc = cache[cachekey]
            elif os.path.exists(cachefile):
                dataloc, metadata = from_yaml_function(cachefile)
                cache[cachekey] = dataloc
            else:
                # the cache key has not been found: run the actual function to read the
                # data location from the files, using the actual values of the required_params
                required_params_dict = {p: get_param_in_tree(inputconfig, path, p) for p in required_params}
                dataloc, metadata = dataloc_reader_function(**{'output_dates': self[sensor]['output_dates'], **required_params_dict})
                # save into the cache
                cache[cachekey] = dataloc
                save_yaml([dataloc, metadata], filename=cachefile)

        # dataloc has been found, set the value in the appropriate place in the nested dictionary
        set_param_in_tree(self['input'][sensor], path, '_dataloc', value=dataloc)

        if 'xoutputsize' not in self:  # do this only once
            # set also the sizes of the output
            self['xoutputsize'] = metadata['xoutputsize']
            self['youtputsize'] = metadata['youtputsize']
            self['xfullsize'], self['yfullsize'] = self['xoutputsize'], self['youtputsize']
            self['xfullslice'] = slice(0, self['xfullsize'])
            self['yfullslice'] = slice(0, self['yfullsize'])
            self['.xoutputsize'] = 'Comment: xoutputsize should be the size \
of the output file, xfullslice should be the size of the input file. Currently, they are identical.'
            self['.xoutputsize.'] = 'When processing only 1/10 pixels or 1/100 pixels, \