def update(self, data_set_meta_info: DataSetMetaInfo):
    data_type = data_set_meta_info.data_type
    if data_type is None:
        raise ValueError('Data must have Data Type')
    if not self.provides_data_type(data_type):
        raise ValueError('Data Type {} is not provided.'.format(data_type))
    if self._contains(data_set_meta_info):
        return
    data_set_info = {}
    if data_set_meta_info.coverage is not None and loads(data_set_meta_info.coverage) is not None:
        data_set_info['coverage'] = data_set_meta_info.coverage
    data_set_start_time = None
    if data_set_meta_info.start_time is not None:
        data_set_start_time = get_time_from_string(data_set_meta_info.start_time, False)
    data_set_end_time = None
    if data_set_meta_info.end_time is not None:
        data_set_end_time = get_time_from_string(data_set_meta_info.end_time, True)
    if data_set_start_time is not None and data_set_end_time is not None and \
            data_set_start_time > data_set_end_time:
        raise ValueError('start time must not be later than end time')
    if data_set_start_time is not None:
        data_set_info['start_time'] = data_set_meta_info.start_time
    if data_set_end_time is not None:
        data_set_info['end_time'] = data_set_meta_info.end_time
    data_set_info['data_type'] = data_type
    data_set_info['name'] = data_set_meta_info.identifier
    self.data_set_infos['data_sets'].append(data_set_info)
    self._update_json_file()

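# The entry appended above ends up in the backing JSON file. A minimal sketch of one
# such entry (hedged: the field values are illustrative, borrowed from the dummy data
# set used in the tests further below):
#
# {"coverage": "POLYGON((15 15, 25 15, 25 25, 15 25, 15 15))",
#  "start_time": "2017-03-11 14:33:00", "end_time": "2017-03-11 14:45:00",
#  "data_type": "TYPE_C", "name": "dterftge"}
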
def query(self, query_string: str) -> List[DataSetMetaInfo]:
    roi = self.get_roi_from_query_string(query_string)
    query_start_time = self.get_start_time_from_query_string(query_string)
    query_end_time = self.get_end_time_from_query_string(query_string)
    data_types = self.get_data_types_from_query_string(query_string)
    data_set_meta_infos = []
    for data_set_info in self.data_set_infos['data_sets']:
        if data_set_info.get('coverage') is not None and roi is not None:
            data_set_coverage = loads(data_set_info.get('coverage'))
            if not roi.intersects(data_set_coverage):
                continue
        # a data set matches only if its time range overlaps the query range: it must
        # not start after the query ends or end before the query starts
        if query_end_time is not None and data_set_info.get('start_time') is not None:
            data_set_start_time = get_time_from_string(data_set_info.get('start_time'), False)
            if query_end_time < data_set_start_time:
                continue
        if query_start_time is not None and data_set_info.get('end_time') is not None:
            data_set_end_time = get_time_from_string(data_set_info.get('end_time'), True)
            if data_set_end_time < query_start_time:
                continue
        if data_set_info.get('data_type') in data_types:
            data_set_meta_info = DataSetMetaInfo(coverage=data_set_info.get('coverage'),
                                                 start_time=data_set_info.get('start_time'),
                                                 end_time=data_set_info.get('end_time'),
                                                 data_type=data_set_info.get('data_type'),
                                                 identifier=data_set_info.get('name'))
            data_set_meta_infos.append(data_set_meta_info)
    return data_set_meta_infos

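# A minimal usage sketch for query(). The ';'-separated query string layout is inferred
# from get_start_time_from_query_string/get_end_time_from_query_string at the bottom of
# this section (fields 1 and 2 hold the start and end time); treating field 0 as the
# ROI WKT and field 3 as the data type list is an assumption:
#
# >>> store.query('POLYGON((15 15, 25 15, 25 25, 15 25, 15 15));2017-03-01;2017-03-31;TYPE_C')
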
def get(self, data_set_meta_info: DataSetMetaInfo) -> Sequence[FileRef]:
    file_refs = []
    if os.path.exists(data_set_meta_info.identifier):
        mime_type = get_mime_type(data_set_meta_info.identifier)
        file_refs.append(FileRef(data_set_meta_info.identifier, data_set_meta_info.start_time,
                                 data_set_meta_info.end_time, mime_type))
        return file_refs
    relative_path = (self.path + self.pattern).replace('//', '/')
    relative_path = relative_path.replace('/{}/'.format(_DATA_TYPE_PATTERN),
                                          '/{}/'.format(data_set_meta_info.data_type))
    if _DAY_PATTERN not in self.pattern and _MONTH_PATTERN not in self.pattern and \
            _YEAR_PATTERN not in self.pattern:
        if os.path.exists(relative_path):
            file_names = glob.glob(relative_path + '/**', recursive=True)
            for file_name in file_names:
                file_name = file_name.replace('\\', '/')
                if data_set_meta_info.identifier in file_name and \
                        data_validation.is_valid(file_name, data_set_meta_info.data_type):
                    mime_type = get_mime_type(file_name)
                    file_refs.append(FileRef(file_name, data_set_meta_info.start_time,
                                             data_set_meta_info.end_time, mime_type))
        return file_refs
    if data_set_meta_info.start_time is None and data_set_meta_info.end_time is None:
        mime_type = get_mime_type(relative_path)
        file_refs.append(FileRef(relative_path, data_set_meta_info.start_time,
                                 data_set_meta_info.end_time, mime_type))
        return file_refs
    # todo consider (weird) case when a start time but no end time is given
    start_time = get_time_from_string(data_set_meta_info.start_time)
    end_time = get_time_from_string(data_set_meta_info.end_time)
    time = start_time
    while time <= end_time:
        path = relative_path
        path = path.replace('/{}/'.format(_YEAR_PATTERN), '/{:04d}/'.format(time.year))
        path = path.replace('/{}/'.format(_MONTH_PATTERN), '/{:02d}/'.format(time.month))
        path = path.replace('/{}/'.format(_DAY_PATTERN), '/{:02d}/'.format(time.day))
        time = self._get_next_time_step(time)
        if not os.path.exists(path):
            continue
        file_names = glob.glob(path + '/**', recursive=True)
        for file_name in file_names:
            file_name = file_name.replace('\\', '/')
            if data_set_meta_info.identifier in file_name and \
                    data_validation.is_valid(file_name, data_set_meta_info.data_type):
                mime_type = get_mime_type(file_name)
                file_refs.append(FileRef(file_name, data_set_meta_info.start_time,
                                         data_set_meta_info.end_time, mime_type))
    return file_refs

def create_kaska_s2_inference_output_files(start_time: Union[str, datetime],
                                           end_time: Union[str, datetime],
                                           time_step: Union[int, timedelta],
                                           forward_models: List[str],
                                           output_directory: str,
                                           parameters: Optional[List[str]] = None,
                                           state_mask: Optional[str] = None,
                                           roi: Optional[Union[str, Polygon]] = None,
                                           spatial_resolution: Optional[int] = None,
                                           roi_grid: Optional[str] = None,
                                           destination_grid: Optional[str] = None):
    if type(start_time) is str:
        start_time = get_time_from_string(start_time)
    if type(end_time) is str:
        end_time = get_time_from_string(end_time)
    if type(time_step) is int:
        time_step = timedelta(days=time_step)
    time_grid = []
    current_time = start_time
    while current_time < end_time:
        time_grid.append(current_time)
        current_time += time_step
    time_grid.append(end_time)
    mask_data_set, untiled_reprojection = _get_mask_data_set_and_reprojection(
        state_mask, spatial_resolution, roi, roi_grid, destination_grid)
    model_parameter_names = []
    other_logger.info('Assembling model parameter names')
    for forward_model_name in forward_models:
        forward_model = get_forward_model(forward_model_name)
        if forward_model is None:
            other_logger.warning(f'Could not find forward model {forward_model_name}')
            continue
        for variable in forward_model.variables:
            other_logger.info(f'Checking variable {variable}')
            if variable not in model_parameter_names:
                model_parameter_names.append(variable)
    outfile_names = []
    requested_indexes = []
    for i, parameter_name in enumerate(model_parameter_names):
        other_logger.info(f'Checking for {parameter_name}')
        if parameters is None or parameter_name in parameters:
            other_logger.info(f'Creating output files for {parameter_name}')
            requested_indexes.append(i)
            for time_point in time_grid:
                time = time_point.strftime('%Y%j')
                outfile_names.append(f"{output_directory}/{parameter_name}_A{time}.tif")
                other_logger.info(f'Created output file {parameter_name}')
    writer = GeoTiffWriter(outfile_names, mask_data_set.GetGeoTransform(),
                           mask_data_set.GetProjection(), mask_data_set.RasterXSize,
                           mask_data_set.RasterYSize, num_bands=None, data_types=None)
    writer.close()

def put(self, from_url: str, data_set_meta_info: DataSetMetaInfo):
    # we assume here that it suffices to consider the start time for putting a data set correctly
    data_type_path = get_data_type_path(data_set_meta_info.data_type, from_url)
    relative_path = self.path + self.pattern + data_type_path
    relative_path = relative_path.replace('/{}/'.format(_DATA_TYPE_PATTERN),
                                          '/{}/'.format(data_set_meta_info.data_type))
    if _YEAR_PATTERN in relative_path or _MONTH_PATTERN in relative_path or \
            _DAY_PATTERN in relative_path:
        if data_set_meta_info.start_time is None:
            raise ValueError('Data Set Meta Info is missing required time information')
        time = get_time_from_string(data_set_meta_info.start_time)
        relative_path = relative_path.replace('/{}/'.format(_YEAR_PATTERN),
                                              '/{:04d}/'.format(time.year))
        relative_path = relative_path.replace('/{}/'.format(_MONTH_PATTERN),
                                              '/{:02d}/'.format(time.month))
        relative_path = relative_path.replace('/{}/'.format(_DAY_PATTERN),
                                              '/{:02d}/'.format(time.day))
    if not from_url == relative_path:
        if os.path.isdir(from_url):
            if os.path.exists(relative_path):
                shutil.rmtree(relative_path)
            shutil.copytree(from_url, relative_path)
        else:
            if not os.path.exists(relative_path):
                os.makedirs(relative_path)
            shutil.copy(from_url, relative_path)
    return DataSetMetaInfo(data_set_meta_info.coverage, data_set_meta_info.start_time,
                           data_set_meta_info.end_time, data_set_meta_info.data_type,
                           relative_path)

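# Path expansion sketch for put() (hedged: the concrete values of _DATA_TYPE_PATTERN,
# _YEAR_PATTERN, _MONTH_PATTERN and _DAY_PATTERN are not shown in this section, so the
# layout below is illustrative): with path '/store' and a pattern chaining the data
# type, year, month and day placeholders, a TYPE_C data set starting
# '2017-03-11 14:33:00' is copied to '/store/TYPE_C/2017/03/11/'.
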
def add_observations(self, product_observations: ProductObservations, date: str):
    bands_per_observation = product_observations.bands_per_observation
    date = get_time_from_string(date)
    self.dates.append(date)
    self._observations[date] = product_observations
    self.bands_per_observation[date] = bands_per_observation

def _notify_copied_to_local(self, data_set_meta_info: DataSetMetaInfo):
    tile_name = self._get_tile_name(data_set_meta_info.identifier)
    start_time_as_datetime = get_time_from_string(data_set_meta_info.start_time)
    year = start_time_as_datetime.year
    month = start_time_as_datetime.month
    day = start_time_as_datetime.day
    aws_index = self._get_aws_index(data_set_meta_info.identifier)
    time = start_time_as_datetime.strftime('%Y-%m-%d')
    file_dir = '{0}/{1},{2},{3}/'.format(self._temp_dir, tile_name, time, aws_index)
    other_file_dir = '{0}/{1}/{2}/{3}/{4}/{5}/{6}/{7}/'.format(self._temp_dir, tile_name[0:2],
                                                               tile_name[2:3], tile_name[3:5],
                                                               year, month, day, aws_index)
    if os.path.exists(file_dir):
        shutil.rmtree(file_dir)
    if os.path.exists(other_file_dir):
        shutil.rmtree(other_file_dir)

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    id = path.split('/')[-1]
    dataset = xarray.open_dataset(path)
    coverage = None
    if 'lat' in dataset.coords and 'lon' in dataset.coords:
        lat_min = dataset.lat.min().values.item(0)
        lat_max = dataset.lat.max().values.item(0)
        lon_min = dataset.lon.min().values.item(0)
        lon_max = dataset.lon.max().values.item(0)
        coverage = f'POLYGON(({lon_min} {lat_max}, {lon_max} {lat_max}, {lon_max} {lat_min}, ' \
                   f'{lon_min} {lat_min}, {lon_min} {lat_max}))'
    dataset.close()
    start_time = get_time_from_string(id[17:32]).strftime('%Y-%m-%d %H:%M:%S')
    end_time = get_time_from_string(id[33:48]).strftime('%Y-%m-%d %H:%M:%S')
    return DataSetMetaInfo(identifier=id, coverage=coverage, start_time=start_time,
                           end_time=end_time, data_type=DataTypeConstants.S1_SPECKLED)

def create_sar_config_file(temp_dir: str, roi: str, start_time: str, end_time: str,
                           s1_slc_directory: str, s1_grd_directory: str,
                           temporal_filter: str) -> str:
    config = {'SAR': {}}
    config['SAR']['input_folder'] = s1_slc_directory
    config['SAR']['output_folder'] = s1_grd_directory
    config['SAR']['gpt'] = '/software/snap/bin/gpt'
    config['SAR']['speckle_filter'] = {'multi_temporal': {'apply': 'yes',
                                                          'files': temporal_filter}}
    minx, miny, maxx, maxy = loads(roi).bounds
    config['SAR']['region'] = {'ul': {'lat': maxy, 'lon': minx},
                               'lr': {'lat': miny, 'lon': maxx}}
    start_time = get_time_from_string(start_time)
    if start_time is not None:
        config['SAR']['year'] = start_time.year
    else:
        end_time = get_time_from_string(end_time)
        if end_time is not None:
            config['SAR']['year'] = end_time.year
    config_file_name = '{}/sar_config.yaml'.format(temp_dir)
    with open(config_file_name, 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    return config_file_name

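# Sketch of the YAML written by create_sar_config_file (hedged: key order and exact
# scalar formatting depend on yaml.dump):
#
# SAR:
#   input_folder: <s1_slc_directory>
#   output_folder: <s1_grd_directory>
#   gpt: /software/snap/bin/gpt
#   speckle_filter:
#     multi_temporal:
#       apply: 'yes'
#       files: <temporal_filter>
#   region:
#     ul: {lat: <maxy>, lon: <minx>}
#     lr: {lat: <miny>, lon: <maxx>}
#   year: <year of start_time, falling back to end_time>
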
def _query_wrapped_meta_info_provider(self, query_string: str,
                                      local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    only_dataset = DataSetMetaInfo(coverage="POLYGON((15 15, 25 15, 25 25, 15 25, 15 15))",
                                   start_time="2017-03-11 14:33:00",
                                   end_time="2017-03-11 14:45:00",
                                   data_type="TYPE_C",
                                   identifier="dterftge")
    if not self.get_roi_from_query_string(query_string).intersects(loads(only_dataset.coverage)):
        return []
    if self.get_start_time_from_query_string(query_string) > \
            get_time_from_string(only_dataset.end_time):
        return []
    if self.get_end_time_from_query_string(query_string) < \
            get_time_from_string(only_dataset.start_time):
        return []
    if 'TYPE_C' not in self.get_data_types_from_query_string(query_string):
        return []
    return [only_dataset]

def _get_bucket_names(data_set_meta_info: DataSetMetaInfo) -> List[str]:
    start_time = get_time_from_string(data_set_meta_info.start_time)
    base_bucket_names = _DATA_TYPE_PARAMETER_DICTS[data_set_meta_info.data_type]['baseBuckets']
    bucket_names = []
    quarter = (start_time.month - 1) // 3 + 1
    for base_bucket_name in base_bucket_names:
        bucket_name = base_bucket_name.replace('{YYYY}', str(start_time.year))
        bucket_name = bucket_name.replace('{q}', str(quarter))
        bucket_names.append(bucket_name)
    return bucket_names

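# A worked example of the bucket name expansion above (hedged: the base bucket name is
# hypothetical): for a data set starting in May 2018, (5 - 1) // 3 + 1 == 2, so a base
# bucket name 'some-bucket-{YYYY}-q{q}' becomes 'some-bucket-2018-q2'.
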
def test_get_data_set_meta_infos_for_tile_description():
    parameters = {'path_to_json_file': path_to_json_file}
    aws_s2_meta_info_provider = AwsS2MetaInfoProviderAccessor.create_from_parameters(parameters)
    tile_description = TileDescription('30SWJ', BARRAX_TILE)
    start_time = get_time_from_string('2016-04-01')
    end_time = get_time_from_string('2016-04-30')
    data_set_meta_infos = aws_s2_meta_info_provider._get_data_set_meta_infos_for_tile_description(
        tile_description, start_time, end_time)
    assert 6 == len(data_set_meta_infos)
    assert '2016-04-01T10:57:59' == data_set_meta_infos[0].start_time
    assert '30/S/WJ/2016/4/1/0' == data_set_meta_infos[0].identifier
    assert '2016-04-04T11:03:11' == data_set_meta_infos[1].start_time
    assert '30/S/WJ/2016/4/4/0' == data_set_meta_infos[1].identifier
    assert '2016-04-11T10:57:56' == data_set_meta_infos[2].start_time
    assert '30/S/WJ/2016/4/11/0' == data_set_meta_infos[2].identifier
    assert '2016-04-14T11:09:07' == data_set_meta_infos[3].start_time
    assert '30/S/WJ/2016/4/14/0' == data_set_meta_infos[3].identifier
    assert '2016-04-21T10:59:16' == data_set_meta_infos[4].start_time
    assert '30/S/WJ/2016/4/21/0' == data_set_meta_infos[4].identifier
    assert '2016-04-24T11:09:39' == data_set_meta_infos[5].start_time
    assert '30/S/WJ/2016/4/24/0' == data_set_meta_infos[5].identifier

def _get_prefix(data_set_meta_info: DataSetMetaInfo):
    data_type_dict = _DATA_TYPE_PARAMETER_DICTS[data_set_meta_info.data_type]
    storage_structure = data_type_dict['storageStructure']
    data_time = get_time_from_string(data_set_meta_info.start_time)
    prefix = storage_structure.replace('YYYY', '{:04d}'.format(data_time.year))
    prefix = prefix.replace('MM', '{:02d}'.format(data_time.month))
    prefix = prefix.replace('DD', '{:02d}'.format(data_time.day))
    for placeholder in data_type_dict['placeholders'].keys():
        start = data_type_dict['placeholders'][placeholder]['start']
        end = data_type_dict['placeholders'][placeholder]['end']
        prefix = prefix.replace(placeholder, data_set_meta_info.identifier[start:end])
    return prefix

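# Illustration of the prefix construction above (hedged: the storage structure and the
# placeholder dictionary are hypothetical): with storageStructure 'YYYY/MM/DD/{TILE}',
# placeholders {'{TILE}': {'start': 0, 'end': 5}}, an identifier beginning '30SWJ' and
# start time '2016-04-01', the resulting prefix would be '2016/04/01/30SWJ'.
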
def _query_wrapped_meta_info_provider(self, query_string: str,
                                      local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    data_types = self.get_data_types_from_query_string(query_string)
    if DataTypeConstants.AWS_S2_L1C not in data_types:
        return []
    roi = self.get_roi_from_query_string(query_string)
    tile_descriptions = self.get_affected_tile_descriptions(roi)
    start_time = self.get_start_time_from_query_string(query_string)
    if start_time is None:
        start_time = get_time_from_string(FIRST_DAY)
    end_time = self.get_end_time_from_query_string(query_string)
    if end_time is None:
        end_time = datetime.now()
    data_set_meta_infos = []
    for tile_description in tile_descriptions:
        data_set_meta_infos_for_tile = self._get_data_set_meta_infos_for_tile_description(
            tile_description, start_time, end_time)
        for data_set_meta_info_for_tile in data_set_meta_infos_for_tile:
            if not self._is_provided_locally(data_set_meta_info_for_tile, local_data_set_meta_infos):
                data_set_meta_infos.append(data_set_meta_info_for_tile)
    return data_set_meta_infos

def _get_file_ref(self, data_set_meta_info: DataSetMetaInfo, bands=None, metafiles=None) \
        -> Optional[FileRef]:
    """auxiliary method to delimit the number of downloaded files for testing"""
    if not self._is_valid_identifier(data_set_meta_info.identifier):
        # consider throwing an exception
        return None
    from sentinelhub import AwsTileRequest
    tile_name = self._get_tile_name(data_set_meta_info.identifier)
    start_time_as_datetime = get_time_from_string(data_set_meta_info.start_time)
    time = start_time_as_datetime.strftime('%Y-%m-%d')
    aws_index = self._get_aws_index(data_set_meta_info.identifier)
    request = AwsTileRequest(tile=tile_name, time=time, aws_index=aws_index, bands=bands,
                             metafiles=metafiles, data_folder=self._temp_dir)
    year = start_time_as_datetime.year
    month = start_time_as_datetime.month
    day = start_time_as_datetime.day
    logging.info('Downloading S2 Data from {}-{}-{}'.format(month, day, year))
    request.save_data()
    saved_dir = '{}/{},{}-{:02d}-{:02d},{}/'.format(self._temp_dir, tile_name, year,
                                                    month, day, aws_index)
    new_dir = '{0}/{1}/{2}/{3}/{4}/{5}/{6}/{7}/'.format(self._temp_dir, tile_name[0:2],
                                                        tile_name[2:3], tile_name[3:5],
                                                        year, month, day, aws_index)
    copy_tree(saved_dir, new_dir)
    logging.info('Downloaded S2 Data from {}-{}-{}'.format(month, day, year))
    return FileRef(new_dir, data_set_meta_info.start_time, data_set_meta_info.end_time,
                   get_mime_type(new_dir))

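# Directory layout sketch (derived from the format strings above): for tile '30SWJ' on
# 2016-04-01 with aws_index 0, sentinelhub saves to '<temp_dir>/30SWJ,2016-04-01,0/',
# and the data is then mirrored to '<temp_dir>/30/S/WJ/2016/4/1/0/', matching the
# '30/S/WJ/2016/4/1/0' identifiers used elsewhere in this section.
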
def _get_from_wrapped(self, data_set_meta_info: DataSetMetaInfo) -> Sequence[FileRef]:
    file_refs = []
    time = get_time_from_string(data_set_meta_info.start_time)
    file_url = '{}/{}/{}/{}.{:02d}.{:02d}/{}'.format(_BASE_URL, _PLATFORM,
                                                     data_set_meta_info.data_type,
                                                     time.year, time.month, time.day,
                                                     data_set_meta_info.identifier)
    request = urllib2.Request(file_url)
    authorization = base64.encodebytes(str.encode('{}:{}'.format(self._username, self._password))). \
        replace(b'\n', b'').decode()
    request.add_header('Authorization', 'Basic {}'.format(authorization))
    remote_file = self._opener.open(request)
    temp_url = '{}/{}'.format(self._temp_dir, data_set_meta_info.identifier)
    logging.info('Downloading {}'.format(data_set_meta_info.identifier))
    with open(temp_url, 'wb') as temp_file:
        total_size_in_bytes = int(remote_file.info()['Content-Length'])
        one_percent = total_size_in_bytes / 100
        downloaded_bytes = 0
        next_threshold = one_percent
        length = 1024
        buf = remote_file.read(length)
        while buf:
            temp_file.write(buf)
            buf = remote_file.read(length)
            downloaded_bytes += 1024
            if downloaded_bytes > next_threshold:
                stdout.write('\r{} %'.format(int(next_threshold / one_percent)))
                stdout.flush()
                next_threshold += one_percent
    logging.info('Downloaded {}'.format(data_set_meta_info.identifier))
    file_refs.append(FileRef(temp_url, data_set_meta_info.start_time,
                             data_set_meta_info.end_time, get_mime_type(temp_url)))
    return file_refs

def remove(self, data_set_meta_info: DataSetMetaInfo):
    # todo test whether this works with aws s2 data too
    time = get_time_from_string(data_set_meta_info.start_time)
    relative_path = self.path + self.pattern
    relative_path = relative_path.replace('/{}/'.format(_DATA_TYPE_PATTERN),
                                          '/{}/'.format(data_set_meta_info.data_type))
    relative_path = relative_path.replace('/{}/'.format(_YEAR_PATTERN),
                                          '/{:04d}/'.format(time.year))
    relative_path = relative_path.replace('/{}/'.format(_MONTH_PATTERN),
                                          '/{:02d}/'.format(time.month))
    relative_path = relative_path.replace('/{}/'.format(_DAY_PATTERN),
                                          '/{:02d}/'.format(time.day))
    if os.path.exists(relative_path):
        file_names = os.listdir(relative_path)
        for file_name in file_names:
            if data_set_meta_info.identifier in file_name:
                os.remove(relative_path + file_name)
        # walk back up towards the base path, removing directories as long as they are empty
        while not self.path == relative_path and len(os.listdir(relative_path)) == 0:
            os.rmdir(relative_path)
            relative_path = relative_path[:relative_path[:relative_path.rfind('/')].rfind('/')]

def infer_kaska_s2(start_time: Union[str, datetime],
                   end_time: Union[str, datetime],
                   time_step: Union[int, timedelta],
                   datasets_dir: str,
                   forward_models: List[str],
                   output_directory: str,
                   parameters: Optional[List[str]] = None,
                   state_mask: Optional[str] = None,
                   roi: Optional[Union[str, Polygon]] = None,
                   spatial_resolution: Optional[int] = None,
                   roi_grid: Optional[str] = None,
                   destination_grid: Optional[str] = None,
                   tile_index_x: Optional[int] = 0,
                   tile_index_y: Optional[int] = 0,
                   tile_width: Optional[int] = None,
                   tile_height: Optional[int] = None):
    if type(start_time) is str:
        start_time = get_time_from_string(start_time)
    if type(end_time) is str:
        end_time = get_time_from_string(end_time)
    if type(time_step) is int:
        time_step = timedelta(days=time_step)
    time_grid = []
    current_time = start_time
    while current_time < end_time:
        time_grid.append(current_time)
        current_time += time_step
    time_grid.append(end_time)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    temp_dir = f'{output_directory}/temp_{tile_index_x}_{tile_index_y}/'
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    mask_data_set, untiled_reprojection = _get_mask_data_set_and_reprojection(
        state_mask, spatial_resolution, roi, roi_grid, destination_grid)
    reprojection = untiled_reprojection
    tile_mask_data_set = mask_data_set
    raster_width = mask_data_set.RasterXSize
    raster_height = mask_data_set.RasterYSize
    offset_x = 0
    offset_y = 0
    if tile_width is not None and tile_height is not None:
        geo_transform = mask_data_set.GetGeoTransform()
        ulx, xres, xskew, uly, yskew, yres = geo_transform
        minlrx = ulx + (mask_data_set.RasterXSize * xres)
        minlry = uly + (mask_data_set.RasterYSize * yres)
        ulx = ulx + (tile_index_x * tile_width * xres)
        uly = uly + (tile_index_y * tile_height * yres)
        lrx = ulx + (tile_width * xres)
        lry = uly + (tile_height * yres)
        raster_width = tile_width
        raster_height = tile_height
        # clamp the tile to the raster bounds and shrink its extent accordingly
        if (lrx > ulx and lrx > minlrx) or (lrx < ulx and lrx < minlrx):
            lrx = minlrx
            raster_width = np.abs((ulx - lrx) / xres)
        if (lry > uly and lry > minlry) or (lry < uly and lry < minlry):
            lry = minlry
            raster_height = np.abs((uly - lry) / yres)
        offset_x = tile_index_x * tile_width
        offset_y = tile_index_y * tile_height
        roi_bounds = (min(ulx, lrx), min(uly, lry), max(ulx, lrx), max(uly, lry))
        destination_spatial_reference_system = osr.SpatialReference()
        projection = mask_data_set.GetProjection()
        destination_spatial_reference_system.ImportFromWkt(projection)
        reprojection = Reprojection(roi_bounds, xres, yres, destination_spatial_reference_system)
        tile_mask_data_set = reprojection.reproject(mask_data_set)
    elif tile_width is not None or tile_height is not None:
        logging.warning('To use tiling, parameters tileWidth and tileHeight must both be set. '
                        'Continuing without tiling.')
    file_refs = _get_valid_files(datasets_dir)
    observations_factory = ObservationsFactory()
    observations_factory.sort_file_ref_list(file_refs)
    # an observations wrapper to be passed to kafka
    observations = observations_factory.create_observations(file_refs, reprojection, forward_models)
    model_parameter_names = []
    other_logger.info('Assembling model parameter names')
    for forward_model_name in forward_models:
        forward_model = get_forward_model(forward_model_name)
        if forward_model is None:
            other_logger.warning(f'Could not find forward model {forward_model_name}')
            continue
        for variable in forward_model.variables:
            other_logger.info(f'Checking variable {variable}')
            if variable not in model_parameter_names:
                model_parameter_names.append(variable)
    outfile_names = []
    requested_indexes = []
    for i, parameter_name in enumerate(model_parameter_names):
        other_logger.info(f'Checking for {parameter_name}')
        if parameters is None or parameter_name in parameters:
            other_logger.info(f'Creating output files for {parameter_name}')
            requested_indexes.append(i)
            for time_point in time_grid:
                time = time_point.strftime('%Y-%m-%d')
                outfile_names.append(f"{output_directory}/s2_{parameter_name}_A{time}.tif")
                other_logger.info(f'Created output file {parameter_name}')
    writer = GeoTiffWriter(outfile_names, mask_data_set.GetGeoTransform(),
                           mask_data_set.GetProjection(), mask_data_set.RasterXSize,
                           mask_data_set.RasterYSize, num_bands=None, data_types=None)
    data = []
    at_least_one_valid_observation = False
    for date in observations.dates:
        granule = observations.read_granule(date)
        if granule[0] is not None:
            at_least_one_valid_observation = True
            break
    if not at_least_one_valid_observation:
        logging.info('No valid observations found. Will skip inference.')
        for j in requested_indexes:
            for i in range(len(time_grid)):
                data.append(np.zeros((int(raster_height), int(raster_width))))
    else:
        # todo make this more elaborate when more than one inverter is available
        approx_inverter = get_inverter("prosail_5paras", "Sentinel2")
        kaska = KaSKA(observations=observations, time_grid=time_grid,
                      state_mask=tile_mask_data_set, approx_inverter=approx_inverter,
                      output_folder=temp_dir, save_sgl_inversion=False)
        results = kaska.run_retrieval()
        for j, sub_data in enumerate(results[1:]):
            if j in requested_indexes:
                for i in range(len(time_grid)):
                    data.append(sub_data[i, :, :])
    other_logger.info(f'Writing to {offset_x}, {offset_y} with width {raster_width} '
                      f'and height {raster_height}')
    writer.write(data, raster_width, raster_height, offset_x, offset_y)

def _pm_request_of(request, workdir: str, id: str) -> Dict:
    template_text = pkg_resources.resource_string(__name__, "resources/pm_request_template.json")
    pm_request = json.loads(template_text)
    pm_request['requestName'] = f"{workdir}/{request['name']}"
    pm_request['requestId'] = id
    pm_request['productionType'] = _determine_workflow(request)
    pm_request['data_root'] = workdir
    pm_request['simulation'] = pm_request['simulation'] == 'True'
    pm_request['log_dir'] = f'{workdir}/log'
    pm_request['General']['roi'] = request['roi']
    pm_request['General']['start_time'] = \
        datetime.datetime.strftime(get_time_from_string(request['timeRange'][0]), '%Y-%m-%d')
    pm_request['General']['end_time'] = \
        datetime.datetime.strftime(get_time_from_string(request['timeRange'][1]), '%Y-%m-%d')
    pm_request['General']['time_interval'] = request['timeStep']
    pm_request['General']['spatial_resolution'] = request['spatialResolution']
    pm_request['General']['tile_width'] = 512
    pm_request['General']['tile_height'] = 512
    num_tiles_x, num_tiles_y = _get_num_tiles_of_request(request, 512, 512)
    pm_request['General']['num_tiles_x'] = num_tiles_x
    pm_request['General']['num_tiles_y'] = num_tiles_y
    pm_request['Inference']['time_interval'] = request['timeStep']
    forward_models = []
    for model_dict in request['forwardModels']:
        model = {
            "name": model_dict["name"],
            "type": model_dict["type"],
            "data_type": model_dict["modelDataType"],
            "required_priors": [prior for prior in model_dict["requiredPriors"]],
            "output_parameters": [parameter for parameter in model_dict["outputParameters"]]
        }
        forward_models.append(model)
    pm_request['Inference']['forward_models'] = forward_models
    pm_request['Prior']['output_directory'] = workdir + '/priors'
    for user_prior_dict in request['userPriors']:
        if 'mu' in user_prior_dict:
            pm_request['Prior'][user_prior_dict['name']] = {'user': {'mu': user_prior_dict['mu']}}
        if 'unc' in user_prior_dict:
            if 'user' not in pm_request['Prior'][user_prior_dict['name']]:
                pm_request['Prior'][user_prior_dict['name']]['user'] = {}
            pm_request['Prior'][user_prior_dict['name']]['user']['unc'] = user_prior_dict['unc']
    if 's1TemporalFilter' in request:
        pm_request['SAR']['speckle_filter']['multi_temporal']['files'] = request['s1TemporalFilter']
        (min_lon, min_lat, max_lon, max_lat) = loads(request['roi']).bounds
        pm_request['SAR']['region']['ul']['lat'] = max_lat
        pm_request['SAR']['region']['ul']['lon'] = min_lon
        pm_request['SAR']['region']['lr']['lat'] = min_lat
        pm_request['SAR']['region']['lr']['lon'] = max_lon
        pm_request['SAR']['year'] = datetime.datetime.strftime(
            get_time_from_string(request['timeRange'][0]), '%Y')
    if 's2ComputeRoi' in request:
        pm_request['S2-PreProcessing']['compute_only_roi'] = request['s2ComputeRoi']
    if 'postProcessors' in request:
        post_processor_list = []
        for post_processor_dict in request['postProcessors']:
            pp_dict = {}
            pp_dict['name'] = post_processor_dict['name']
            pp_dict['type'] = post_processor_dict['type']
            pp_dict['input_types'] = [input_type for input_type
                                      in post_processor_dict["inputTypes"]]
            pp_dict['indicator_names'] = [indicator_name for indicator_name
                                          in post_processor_dict["indicatorNames"]]
            pp_dict['variable_names'] = [variable_name for variable_name
                                         in post_processor_dict["variableNames"]]
            post_processor_list.append(pp_dict)
        pm_request['post_processing']['post_processors'] = post_processor_list
    return pm_request

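# A minimal request sketch for _pm_request_of (hedged: values are illustrative; only
# the keys read unconditionally above are shown, optional keys such as
# 's1TemporalFilter' and 'postProcessors' are omitted):
#
# request = {'name': 'test_request',
#            'roi': 'POLYGON((15 15, 25 15, 25 25, 15 25, 15 15))',
#            'timeRange': ['2017-03-01', '2017-03-31'],
#            'timeStep': 5,
#            'spatialResolution': 60,
#            'forwardModels': [{'name': ..., 'type': ..., 'modelDataType': ...,
#                               'requiredPriors': [...], 'outputParameters': [...]}],
#            'userPriors': []}
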
def _infer(start_time: Union[str, datetime],
           end_time: Union[str, datetime],
           parameter_list: List[str],
           prior_directory: str,
           datasets_dir: str,
           previous_state_dir: str,
           next_state_dir: str,
           emulators_directory: Optional[str],
           forward_models: Optional[List[str]],
           output_directory: str,
           state_mask: Optional[str],
           roi: Optional[Union[str, Polygon]],
           spatial_resolution: Optional[int],
           roi_grid: Optional[str],
           destination_grid: Optional[str]):
    # we assume that time is derived for one time step; or, to be more precise, for one
    # time period (with no intermediate time steps). This time step/time period is
    # described by start time and end time.
    if type(start_time) is str:
        start_time = get_time_from_string(start_time)
    if type(end_time) is str:
        end_time = get_time_from_string(end_time)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    if forward_models is None and emulators_directory is not None:
        logging.info('Determining forward model name from emulators directory')
        aux_data_provider = get_aux_data_provider()
        model_metadata_file = f'{emulators_directory}/metadata.json'
        if aux_data_provider.assure_element_provided(model_metadata_file):
            with open(model_metadata_file, 'r') as model_file:
                model_metadata = json.load(model_file)
                forward_models = [model_metadata['id']]
                logging.info(f"Determined forward model '{forward_models[0]}' "
                             f"from emulators directory")
        else:
            raise FileNotFoundError(f'Could not find {model_metadata_file}')
    mask_data_set, reprojection = _get_mask_data_set_and_reprojection(
        state_mask, spatial_resolution, roi, roi_grid, destination_grid)
    mask = mask_data_set.ReadAsArray().astype(bool)
    geo_transform = mask_data_set.GetGeoTransform()
    projection = mask_data_set.GetProjection()
    complete_parameter_list = []
    for forward_model_name in forward_models:
        logging.info(f'Checking for forward model {forward_model_name}')
        forward_model = get_forward_model(forward_model_name)
        if forward_model is not None:
            logging.info(f'Forward model {forward_model_name} found')
            model_variables = forward_model.variables
            for model_variable in model_variables:
                if model_variable not in complete_parameter_list:
                    complete_parameter_list.append(model_variable)
        else:
            raise ValueError(f'Could not find {forward_model_name}')
    output = InferenceWriter(parameter_list, complete_parameter_list, output_directory,
                             start_time, geo_transform, projection, mask.shape[1],
                             mask.shape[0], state_folder=next_state_dir)
    prior_files = glob.glob(prior_directory + '/*.vrt')
    inference_prior = InferencePrior('', global_prior_files=prior_files,
                                     reference_dataset=mask_data_set)
    file_refs = _get_valid_files(datasets_dir)
    observations_factory = ObservationsFactory()
    observations_factory.sort_file_ref_list(file_refs)
    # an observations wrapper to be passed to kafka
    observations = observations_factory.create_observations(file_refs, reprojection, forward_models)
    p_forecast_inv = None
    x_forecast = None
    if previous_state_dir is not None and os.path.exists(previous_state_dir):
        p_inv_fname = "P_analysis_inv_%s.npz" % start_time.strftime("A%Y%j")
        p_inv_fname = os.path.join(previous_state_dir, p_inv_fname)
        if os.path.exists(p_inv_fname):
            p_forecast_inv = sp.load_npz(p_inv_fname)
        x_fname = "X_analysis_%s.npz" % start_time.strftime("A%Y%j")
        x_fname = os.path.join(previous_state_dir, x_fname)
        if os.path.exists(x_fname):
            x_forecast = np.load(x_fname)['arr_0']
        mask_fname = "state_mask_%s.npz" % start_time.strftime("A%Y%j")
        mask_fname = os.path.join(previous_state_dir, mask_fname)
        if os.path.exists(mask_fname):
            mask = np.load(mask_fname)['arr_0']
    if p_forecast_inv is None or x_forecast is None:
        processed_prior = inference_prior.process_prior(complete_parameter_list, start_time, mask)
        if x_forecast is None:
            x_forecast = processed_prior[0]
        if p_forecast_inv is None:
            p_forecast_inv = processed_prior[1]
        mask = processed_prior[2]
    linear_kalman = LinearKalman(observations, output, mask,
                                 create_prosail_observation_operator,
                                 complete_parameter_list, state_propagation=propagator,
                                 prior=None, linear=False)
    # Inflation amount for propagation
    q = np.zeros_like(x_forecast)
    # todo figure out correct setting
    if 'lai' in complete_parameter_list:
        lai_index = complete_parameter_list.index('lai')
        q[lai_index::len(complete_parameter_list)] = 0.05
    linear_kalman.set_trajectory_model()
    linear_kalman.set_trajectory_uncertainty(q)
    time_grid = [start_time, end_time]
    linear_kalman.run(time_grid, x_forecast, None, p_forecast_inv, iter_obs_op=True)

def test_create_observations():
    class DummyObservations(ProductObservations):

        def get_band_data_by_name(self, band_name: str, retrieve_uncertainty: bool = True) \
                -> ObservationData:
            return ObservationData(observations=np.array([0.5]), uncertainty=sp.lil_matrix((1, 1)),
                                   mask=np.array([0]), metadata={}, emulator=None)

        def get_band_data(self, band_index: int, retrieve_uncertainty: bool = True) \
                -> ObservationData:
            return ObservationData(observations=np.array([0.5]), uncertainty=sp.lil_matrix((1, 1)),
                                   mask=np.array([0]), metadata={}, emulator=None)

        @property
        def bands_per_observation(self):
            return 15

        @property
        def data_type(self):
            return 'dummy_type'

        def set_no_data_value(self, band: Union[str, int], no_data_value: float):
            pass

    class DummyObservationsCreator(ProductObservationsCreator):
        DUMMY_PATTERN = 'dfghztm_[0-9]{4}_dvfgbh'
        DUMMY_PATTERN_MATCHER = re.compile(DUMMY_PATTERN)

        @classmethod
        def can_read(cls, file_ref: FileRef) -> bool:
            return os.path.exists(file_ref.url) and \
                   cls.DUMMY_PATTERN_MATCHER.search(file_ref.url) is not None

        @classmethod
        def create_observations(cls, file_ref: FileRef, reprojection: Optional[Reprojection],
                                emulator_folder: Optional[str]) -> ProductObservations:
            if cls.can_read(file_ref):
                return DummyObservations()

    observations_factory = ObservationsFactory()
    observations_factory.add_observations_creator_to_registry(DummyObservationsCreator())
    start_time = '2017-06-04'
    file_refs = [FileRef(url=DUMMY_FILE, start_time=start_time, end_time='2017-06-07',
                         mime_type='unknown mime type'),
                 FileRef(url='tzzg', start_time='2017-06-07', end_time='2017-06-10',
                         mime_type='unknown mime type')]
    observations_wrapper = observations_factory.create_observations(file_refs, None, '')
    assert 1 == observations_wrapper.get_num_observations()
    assert 15 == observations_wrapper.bands_per_observation(0)
    start_time = get_time_from_string(start_time)
    data = observations_wrapper.get_band_data(start_time, 0)
    assert 1 == len(data.observations)
    assert 0.5 == data.observations[0]
    other_data = observations_wrapper.get_band_data_by_name(start_time, 'name')
    assert 1 == len(other_data.observations)
    assert 0.5 == other_data.observations[0]
    assert 'dummy_type' == observations_wrapper.get_data_type(start_time)

def _query_wrapped_meta_info_provider(self, query_string: str,
                                      local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    requested_data_types = []
    query_data_types = self.get_data_types_from_query_string(query_string)
    for supported_data_type in self._supported_data_types:
        if supported_data_type in query_data_types:
            requested_data_types.append(supported_data_type)
    if len(requested_data_types) == 0:
        return []
    roi = self.get_roi_from_query_string(query_string)
    tile_coverages = []
    for v in range(18):
        for h in range(36):
            tile_coverage = get_tile_coverage(h, v)
            if tile_coverage is not None and tile_coverage.intersects(roi):
                tile_coverages.append((h, v, tile_coverage.wkt))
    start_time = self.get_start_time_from_query_string(query_string)
    if start_time is None:
        start_time = get_time_from_string(FIRST_DAY)
    end_time = self.get_end_time_from_query_string(query_string)
    if end_time is None:
        end_time = datetime.datetime.now()
    data_set_meta_infos = []
    try:
        for requested_data_type in requested_data_types:
            start_doy = start_time.timetuple().tm_yday
            current_time = start_time - datetime.timedelta(
                days=(start_doy - _DATA_OFFSETS[requested_data_type]) %
                     _DATA_INTERVALS[requested_data_type])
            while current_time < end_time:
                current_time_str = current_time.strftime('%Y-%m-%d %H:%M:%S')
                current_tile_coverages = []
                for h, v, tile_coverage in tile_coverages:
                    add_to_current = True
                    for local_data_set_meta_info in local_data_set_meta_infos:
                        if local_data_set_meta_info.coverage == tile_coverage and \
                                local_data_set_meta_info.start_time == current_time_str:
                            add_to_current = False
                            break
                    if add_to_current:
                        current_tile_coverages.append((h, v, tile_coverage))
                next_time = current_time + datetime.timedelta(
                    days=_DATA_INTERVALS[requested_data_type])
                next_time -= datetime.timedelta(seconds=1)
                if len(current_tile_coverages) > 0:
                    date_dir_url = '{}/{}/{}/{}.{:02d}.{:02d}/'.format(
                        _BASE_URL, _PLATFORM, requested_data_type,
                        current_time.year, current_time.month, current_time.day)
                    date_page = urllib2.urlopen(date_dir_url).read().decode('utf-8')
                    for h, v, tile_coverage in current_tile_coverages:
                        file_regex = '.hdf">{}.A{}{:03d}.h{:02d}v{:02d}.006.*.hdf'.format(
                            requested_data_type.split('.')[0], current_time.year,
                            current_time.timetuple().tm_yday, h, v)
                        available_files = re.findall(file_regex, date_page)
                        for file in available_files:
                            logging.info('Found {} data set for {}'.format(
                                requested_data_type, current_time_str))
                            data_set_meta_infos.append(
                                DataSetMetaInfo(tile_coverage, current_time_str,
                                                next_time.strftime('%Y-%m-%d %H:%M:%S'),
                                                requested_data_type, file[6:]))
                current_time = next_time + datetime.timedelta(seconds=1)
    except URLError as e:
        logging.warning('Could not access NASA Land Processes Distributed Active '
                        'Archive Center: {}'.format(e.reason))
    return data_set_meta_infos

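# Regex sketch for the LP DAAC directory listing above (hedged: 'MCD43A1.006' is an
# assumed data type name): for that type on day-of-year 100 of 2017 and tile h17v05,
# file_regex becomes '.hdf">MCD43A1.A2017100.h17v05.006.*.hdf'; file[6:] then strips
# the leading '.hdf">' from each match to recover the bare file name.
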
def get_end_time_from_query_string(query_string: str) -> Optional[datetime]:
    end_time_as_string = query_string.split(';')[2]
    return get_time_from_string(end_time_as_string, True)

def get_start_time_from_query_string(query_string: str) -> Optional[datetime]:
    start_time_as_string = query_string.split(';')[1]
    return get_time_from_string(start_time_as_string, False)
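
# Doctest-style sketch of the two helpers above (hedged: assumes get_time_from_string
# parses '%Y-%m-%d' strings and that its second argument, when True, pins the result to
# the end of the given period):
#
# >>> get_start_time_from_query_string('POLYGON((...));2017-03-01;2017-03-31;TYPE_C')
# datetime.datetime(2017, 3, 1, 0, 0)
# >>> get_end_time_from_query_string('POLYGON((...));2017-03-01;2017-03-31;TYPE_C')
# datetime.datetime(2017, 3, 31, 23, 59, 59)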