def test_vrt_file_system_get():
    parameters = {'path_to_vrt_file': PATH_TO_NON_EXISTENT_VRT_FILE,
                  'encapsulated_data_type': DataTypeConstants.ASTER,
                  'accessed_file_system': 'LocalFileSystem',
                  'path': './test/test_data/',
                  'pattern': '/dt/'}
    file_system = VrtFileSystemAccessor.create_from_parameters(parameters)
    try:
        data_set_meta_info = DataSetMetaInfo('of no concern here', None, None, DataTypeConstants.ASTER,
                                             PATH_TO_NON_EXISTENT_VRT_FILE, 'ASTGTM2_N36W005_dem.tif')
        file_refs = file_system.get(data_set_meta_info)
        assert 1 == len(file_refs)
        assert PATH_TO_NON_EXISTENT_VRT_FILE == file_refs[0].url
        assert file_refs[0].start_time is None
        assert file_refs[0].end_time is None
        assert 'x-world/x-vrt' == file_refs[0].mime_type
        assert os.path.exists(PATH_TO_NON_EXISTENT_VRT_FILE)
    finally:
        if os.path.exists(PATH_TO_NON_EXISTENT_VRT_FILE):
            os.remove(PATH_TO_NON_EXISTENT_VRT_FILE)

def _get_data_set_meta_infos_for_tile_description(self, tile_description: TileDescription, start_time: datetime,
                                                  end_time: datetime) -> List[DataSetMetaInfo]:
    data_set_meta_infos = []
    current_time = start_time
    while current_time < end_time:
        aws_index = 0
        while aws_index >= 0:
            id = _ID_PATTERN.format(tile_description.tile_id[0:2], tile_description.tile_id[2:3],
                                    tile_description.tile_id[3:5], current_time.year, current_time.month,
                                    current_time.day, aws_index)
            tile_info_url = _AWS_BASE_TILE_INFO_URL.format(id)
            request = requests.get(tile_info_url)
            if request.status_code == 200:
                time = json.loads(request.text)['timestamp'][:-5]
                data_set_meta_infos.append(
                    DataSetMetaInfo(tile_description.coverage, time, time, DataTypeConstants.AWS_S2_L1C, id))
                aws_index += 1
            else:
                aws_index = -1
        current_time += timedelta(days=1)
    return data_set_meta_infos

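# Hedged sketch of the id layout the loop above appears to assume (the exact _ID_PATTERN string is not
# shown here and is an assumption): the Sentinel-2 tile id is split into UTM zone, latitude band and grid
# square, followed by the date parts and the AWS index, yielding ids such as '30/S/WJ/2016/4/1/0'.
_tile_id = '30SWJ'
_assumed_id_pattern = '{}/{}/{}/{}/{}/{}/{}'
assert _assumed_id_pattern.format(_tile_id[0:2], _tile_id[2:3], _tile_id[3:5], 2016, 4, 1, 0) == '30/S/WJ/2016/4/1/0'
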
def _create_data_set_meta_info(self, path: str, manifest_file):
    manifest = XML(manifest_file)
    coverage = self._extract_coverage(manifest)
    start_time = self._extract_start_time(manifest)
    end_time = self._extract_stop_time(manifest)
    id = path.split('/')[-1]
    return DataSetMetaInfo(identifier=id, coverage=coverage, start_time=start_time, end_time=end_time,
                           data_type=DataTypeConstants.S1_SLC)

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    h = int(path[-27:-25])
    v = int(path[-24:-22])
    tile_coverage = get_tile_coverage(h, v).wkt
    year = int(path[-36:-32])
    doy = int(path[-32:-29])
    start_time = get_time_from_year_and_day_of_year(year, doy)
    end_time = self._get_end_time(year, doy)
    return DataSetMetaInfo(tile_coverage, start_time.strftime('%Y-%m-%d %H:%M:%S'),
                           end_time.strftime('%Y-%m-%d %H:%M:%S'), self.name(), path[path.find('MCD'):])

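# Hedged usage sketch: the fixed negative offsets above rely on the standard MODIS naming convention
# <PRODUCT>.A<YYYY><DOY>.h<HH>v<VV>.006.<PRODUCTION>.hdf. The example file name below is an assumption,
# not taken from the source.
_example = 'MCD43A1.A2017247.h17v05.006.2017256031007.hdf'
assert _example[-27:-25] == '17'    # horizontal tile index h
assert _example[-24:-22] == '05'    # vertical tile index v
assert _example[-36:-32] == '2017'  # acquisition year
assert _example[-32:-29] == '247'   # day of year
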
def query_local(self, query_string: str) -> List[DataSetMetaInfo]:
    if self._provided_data_type not in self.get_data_types_from_query_string(query_string):
        return []
    roi = self.get_roi_from_query_string(query_string)
    coverages, referenced_data = self._get_coverages_from_local_meta_info_provider()
    coverage = cascaded_union(coverages)
    if not roi.within(coverage):
        return []
    referenced_data = ';'.join(referenced_data)
    data_set_meta_info = DataSetMetaInfo(coverage.wkt, None, None, self._provided_data_type,
                                         self._path_to_vrt_file, referenced_data)
    return [data_set_meta_info]

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    path_lat_id = path[-15:-14]  # hemisphere letter ('N' or 'S'); slicing the digits here would never match 'S'
    path_lat = float(path[-14:-12])
    if path_lat_id == 'S':
        path_lat *= -1
    path_lon_id = path[-12:-11]
    path_lon = float(path[-11:-8])
    if path_lon_id == 'W':
        path_lon *= -1
    coverage = Polygon([[path_lon, path_lat], [path_lon, path_lat + 1], [path_lon + 1, path_lat + 1],
                        [path_lon + 1, path_lat]])
    return DataSetMetaInfo(coverage.wkt, None, None, DataTypeConstants.ASTER, path)

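# Hedged usage sketch using the ASTER GDEM tile name that also appears in the test above
# ('ASTGTM2_N36W005_dem.tif'): the slices pick the hemisphere letters and the one-degree tile indices.
_example = 'ASTGTM2_N36W005_dem.tif'
assert _example[-15:-14] == 'N'    # latitude hemisphere
assert _example[-14:-12] == '36'   # latitude of the tile's lower-left corner
assert _example[-12:-11] == 'W'    # longitude hemisphere
assert _example[-11:-8] == '005'   # longitude of the tile's lower-left corner
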
def test_notify_copied_to_local():
    parameters = {'path': TEMP_DIR, 'pattern': '', 'url': EMUS_TEST_URL, 'temp_dir': TEMP_DIR}
    file_system = HttpFileSystemAccessor.create_from_parameters(parameters)
    path_to_file = './test/test_data/some_file'
    try:
        # create an empty file and release the handle so it can be removed by _notify_copied_to_local
        open(path_to_file, 'w+').close()
        data_set_meta_info = DataSetMetaInfo('ctfvgb', '2017-09-04', '2017-09-04', 'some_format', 'some_file')
        file_system._notify_copied_to_local(data_set_meta_info)
        assert not os.path.exists(path_to_file)
    finally:
        if os.path.exists(path_to_file):
            os.remove(path_to_file)

def _query_wrapped_meta_info_provider(self, query_string: str, local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    roi = dumps(self.get_roi_from_query_string(query_string))
    data_types = self.get_data_types_from_query_string(query_string)
    start_time = datetime.strftime(self.get_start_time_from_query_string(query_string), "%Y-%m-%dT%H:%M:%SZ")
    end_time = datetime.strftime(self.get_end_time_from_query_string(query_string), "%Y-%m-%dT%H:%M:%SZ")
    data_set_meta_infos = []
    for data_type in data_types:
        if self.provides_data_type(data_type):
            run = 0
            continue_checking_for_data_sets = True
            while continue_checking_for_data_sets:
                scihub_query = self._create_scihub_query(roi, data_type, start_time, end_time, run)
                run += 1
                response = requests.get(scihub_query, auth=(self._username, self._password))
                response_xml = XML(response.content)
                continue_checking_for_data_sets = False
                for child in response_xml:
                    if child.tag == '{http://www.w3.org/2005/Atom}entry':
                        data_set_meta_info_id = ""
                        data_set_meta_info_start_time = ""
                        data_set_meta_info_end_time = ""
                        data_set_meta_info_coverage = ""
                        data_set_meta_info_reference = ""
                        for child2 in child:
                            if child2.tag == '{http://www.w3.org/2005/Atom}id':
                                data_set_meta_info_reference = child2.text
                            elif child2.tag == '{http://www.w3.org/2005/Atom}title':
                                data_set_meta_info_id = child2.text
                            elif child2.tag == '{http://www.w3.org/2005/Atom}date' and 'name' in child2.attrib \
                                    and child2.attrib['name'] == 'beginposition':
                                data_set_meta_info_start_time = child2.text
                            elif child2.tag == '{http://www.w3.org/2005/Atom}date' and 'name' in child2.attrib \
                                    and child2.attrib['name'] == 'endposition':
                                data_set_meta_info_end_time = child2.text
                            elif child2.tag == '{http://www.w3.org/2005/Atom}str' and 'name' in child2.attrib \
                                    and child2.attrib['name'] == 'footprint':
                                data_set_meta_info_coverage = child2.text
                        data_set_meta_info = DataSetMetaInfo(data_set_meta_info_coverage,
                                                             data_set_meta_info_start_time,
                                                             data_set_meta_info_end_time, data_type,
                                                             data_set_meta_info_id, data_set_meta_info_reference)
                        if not self._is_provided_locally(data_set_meta_info, local_data_set_meta_infos):
                            data_set_meta_infos.append(data_set_meta_info)
                            continue_checking_for_data_sets = True
                response.close()
    return data_set_meta_infos

def _query_wrapped_meta_info_provider(self, query_string: str, local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    roi = dumps(self.get_roi_from_query_string(query_string))
    data_types = self.get_data_types_from_query_string(query_string)
    start_time = datetime.strftime(self.get_start_time_from_query_string(query_string), "%Y-%m-%dT%H:%M:%SZ")
    end_time = datetime.strftime(self.get_end_time_from_query_string(query_string), "%Y-%m-%dT%H:%M:%SZ")
    data_set_meta_infos = []
    for data_type in data_types:
        if self.provides_data_type(data_type):
            run = 0
            continue_checking_for_data_sets = True
            while continue_checking_for_data_sets:
                mundi_query = _create_mundi_query(roi, data_type, start_time, end_time, run)
                run += 1
                response = requests.get(mundi_query)
                response_xml = XML(response.content)
                continue_checking_for_data_sets = False
                for child in response_xml:
                    if child.tag == '{http://www.w3.org/2005/Atom}entry':
                        data_set_meta_info_id = ""
                        data_set_meta_info_time = ""
                        data_set_meta_info_coverage = ""
                        for child2 in child:
                            if child2.tag == '{http://www.w3.org/2005/Atom}id':
                                data_set_meta_info_id = child2.text
                            elif child2.tag == '{http://www.georss.org/georss}polygon':
                                data_set_meta_info_coverage = _convert_mundi_coverage(child2.text)
                            elif child2.tag == '{http://tas/DIAS}sensingStartDate':
                                data_set_meta_info_time = child2.text
                        data_set_meta_info = DataSetMetaInfo(data_set_meta_info_coverage, data_set_meta_info_time,
                                                             data_set_meta_info_time, data_type,
                                                             data_set_meta_info_id)
                        if not self._is_provided_locally(data_set_meta_info, local_data_set_meta_infos):
                            data_set_meta_infos.append(data_set_meta_info)
                            continue_checking_for_data_sets = True
    return data_set_meta_infos

def get(self, data_set_meta_info: DataSetMetaInfo) -> Sequence[FileRef]:
    if data_set_meta_info.referenced_data is None:
        return []
    required_datasets = []
    referenced_data_sets = data_set_meta_info.referenced_data.split(';')
    for data_set in referenced_data_sets:
        # coverage is wrong here. We leave it as it makes no difference.
        file_refs = self._file_system.get(DataSetMetaInfo(data_set_meta_info.coverage, None, None,
                                                          self._encapsulated_data_type, data_set))
        for file_ref in file_refs:
            if file_ref.url not in required_datasets:
                required_datasets.append(file_ref.url.replace('//', '/'))
    vrt_dataset = gdal.BuildVRT(self._path_to_vrt_file, required_datasets)
    vrt_dataset.SetMetadataItem('COVERAGE', data_set_meta_info.coverage)
    vrt_dataset.FlushCache()
    self._set_absolute_sources(required_datasets)
    return [FileRef(self._path_to_vrt_file, None, None, get_mime_type(self._path_to_vrt_file))]

def query(self, query_string: str) -> List[DataSetMetaInfo]:
    if self._provided_data_type not in self.get_data_types_from_query_string(query_string):
        return []
    roi = self.get_roi_from_query_string(query_string)
    coverages, referenced_data = self._get_coverages_from_local_meta_info_provider()
    coverage = cascaded_union(coverages)
    if not roi.within(coverage):
        additional_coverages, additional_files = self._get_coverages_from_wrapped_meta_info_provider(query_string)
        for i in range(len(additional_files)):
            if additional_files[i] not in referenced_data:
                referenced_data.append(additional_files[i])
                coverages.append(additional_coverages[i])
        coverage = cascaded_union(coverages)
    referenced_data = ';'.join(referenced_data)
    data_set_meta_info = DataSetMetaInfo(coverage.wkt, None, None, self._provided_data_type,
                                         self._path_to_vrt_file, referenced_data)
    return [data_set_meta_info]

def test_aws_s2_file_system_get_file_ref():
    try:
        parameters = {'temp_dir': OUTPUT_DIR, 'path': './test/test_data/aws_s2_data/', 'pattern': ''}
        aws_s2_file_system = AwsS2FileSystemAccessor.create_from_parameters(parameters)
        data_set_meta_info = DataSetMetaInfo('doesnt matter here', '2016-04-01', '2016-04-01',
                                             DataTypeConstants.AWS_S2_L1C, '30/S/WJ/2016/4/1/0')
        metafiles = ['metadata', 'tileInfo']
        file_ref = aws_s2_file_system._get_file_ref(data_set_meta_info, bands=[], metafiles=metafiles)
        assert '{}/30SWJ,2016-04-01,0/30/S/WJ/2016/4/1/0/'.format(OUTPUT_DIR) == file_ref.url
        assert '2016-04-01' == file_ref.start_time
        assert '2016-04-01' == file_ref.end_time
        assert 'application/x-directory' == file_ref.mime_type
    finally:
        path = '{}/30SWJ,2016-04-01,0/'.format(OUTPUT_DIR)
        if os.path.exists(path):
            shutil.rmtree(path)

def test_notify_copied_to_local():
    dir_to_be_deleted = '{}/24CBS,2017-10-16,1/'.format(OUTPUT_DIR)
    other_dir_to_be_deleted = '{}/24/C/BS/2017/10/16/1/'.format(OUTPUT_DIR)
    try:
        parameters = {'temp_dir': OUTPUT_DIR, 'path': './test/test_data/aws_s2_data/', 'pattern': ''}
        aws_s2_file_system = AwsS2FileSystemAccessor.create_from_parameters(parameters)
        if not os.path.exists(dir_to_be_deleted):
            os.mkdir(dir_to_be_deleted)
        if not os.path.exists(other_dir_to_be_deleted):
            os.makedirs(other_dir_to_be_deleted)
        data_set_meta_info = DataSetMetaInfo('something', '2017-10-16', '2017-10-16', 'AWS_S2_L1C',
                                             '24/C/BS/2017/10/16/1')
        aws_s2_file_system._notify_copied_to_local(data_set_meta_info)
        assert not os.path.exists(dir_to_be_deleted)
        assert not os.path.exists(other_dir_to_be_deleted)
    finally:
        if os.path.exists(dir_to_be_deleted):
            shutil.rmtree(dir_to_be_deleted)
        if os.path.exists(other_dir_to_be_deleted):
            shutil.rmtree(other_dir_to_be_deleted)

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    id = path.split('/')[-1]
    dataset = xarray.open_dataset(path)
    coverage = None  # remains None when the data set carries no lat/lon coordinates
    if 'lat' in dataset.coords and 'lon' in dataset.coords:
        lat_min = dataset.lat.min().values.item(0)
        lat_max = dataset.lat.max().values.item(0)
        lon_min = dataset.lon.min().values.item(0)
        lon_max = dataset.lon.max().values.item(0)
        coverage = f'POLYGON(({lon_min} {lat_max}, {lon_max} {lat_max}, {lon_max} {lat_min}, ' \
                   f'{lon_min} {lat_min}, {lon_min} {lat_max}))'
    dataset.close()
    start_time = get_time_from_string(id[17:32]).strftime('%Y-%m-%d %H:%M:%S')
    end_time = get_time_from_string(id[33:48]).strftime('%Y-%m-%d %H:%M:%S')
    return DataSetMetaInfo(identifier=id, coverage=coverage, start_time=start_time, end_time=end_time,
                           data_type=DataTypeConstants.S1_SPECKLED)

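# Hedged sketch: the id slices above assume the identifier keeps the Sentinel-1 naming convention
# <MISSION>_<MODE>_<TYPE>_<LEVEL+CLASS+POL>_<start>_<stop>_..., so that characters 17-31 and 33-47 hold
# the sensing start and stop times. The example name is an assumption, not taken from the source.
_example_id = 'S1A_IW_GRDH_1SDV_20170904T033532_20170904T033557_018187_01E95D_5083'
assert _example_id[17:32] == '20170904T033532'  # sensing start time
assert _example_id[33:48] == '20170904T033557'  # sensing stop time
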
def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    return DataSetMetaInfo(GLOBAL, None, None, DataTypeConstants.WV_EMULATOR, path)

def _query_wrapped_meta_info_provider(self, query_string: str, local_data_set_meta_infos: List[DataSetMetaInfo]) \
        -> List[DataSetMetaInfo]:
    requested_data_types = []
    query_data_types = self.get_data_types_from_query_string(query_string)
    for supported_data_type in self._supported_data_types:
        if supported_data_type in query_data_types:
            requested_data_types.append(supported_data_type)
    if len(requested_data_types) == 0:
        return []
    roi = self.get_roi_from_query_string(query_string)
    tile_coverages = []
    for v in range(18):
        for h in range(36):
            tile_coverage = get_tile_coverage(h, v)
            if tile_coverage is not None and tile_coverage.intersects(roi):
                tile_coverages.append((h, v, tile_coverage.wkt))
    start_time = self.get_start_time_from_query_string(query_string)
    if start_time is None:
        start_time = get_time_from_string(FIRST_DAY)
    end_time = self.get_end_time_from_query_string(query_string)
    if end_time is None:
        end_time = datetime.datetime.now()
    data_set_meta_infos = []
    try:
        for requested_data_type in requested_data_types:
            start_doy = start_time.timetuple().tm_yday
            current_time = start_time - datetime.timedelta(
                days=(start_doy - _DATA_OFFSETS[requested_data_type]) % _DATA_INTERVALS[requested_data_type])
            while current_time < end_time:
                current_time_str = current_time.strftime('%Y-%m-%d %H:%M:%S')
                current_tile_coverages = []
                for h, v, tile_coverage in tile_coverages:
                    add_to_current = True
                    for local_data_set_meta_info in local_data_set_meta_infos:
                        if local_data_set_meta_info.coverage == tile_coverage and \
                                local_data_set_meta_info.start_time == current_time_str:
                            add_to_current = False
                            break
                    if add_to_current:
                        current_tile_coverages.append((h, v, tile_coverage))
                next_time = current_time + datetime.timedelta(days=_DATA_INTERVALS[requested_data_type])
                next_time -= datetime.timedelta(seconds=1)
                if len(current_tile_coverages) > 0:
                    date_dir_url = '{}/{}/{}/{}.{:02d}.{:02d}/'.format(_BASE_URL, _PLATFORM, requested_data_type,
                                                                       current_time.year, current_time.month,
                                                                       current_time.day)
                    date_page = urllib2.urlopen(date_dir_url).read().decode('utf-8')
                    for h, v, tile_coverage in current_tile_coverages:
                        file_regex = '.hdf">{}.A{}{:03d}.h{:02d}v{:02d}.006.*.hdf'. \
                            format(requested_data_type.split('.')[0], current_time.year,
                                   current_time.timetuple().tm_yday, h, v)
                        available_files = re.findall(file_regex, date_page)
                        for file in available_files:
                            current_time_str = current_time.strftime('%Y-%m-%d %H:%M:%S')
                            logging.info('Found {} data set for {}'.format(requested_data_type, current_time_str))
                            data_set_meta_infos.append(
                                DataSetMetaInfo(tile_coverage, current_time_str,
                                                next_time.strftime('%Y-%m-%d %H:%M:%S'), requested_data_type,
                                                file[6:]))
                current_time = next_time + datetime.timedelta(seconds=1)
    except URLError as e:
        logging.warning('Could not access NASA Land Processes Distributed Active Archive Center: {}'.format(e.reason))
    return data_set_meta_infos

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    relative_path = get_relative_path(path, DataTypeConstants.CAMS_TIFF)
    return DataSetMetaInfo(GLOBAL, relative_path.replace('_', '-'), relative_path.replace('_', '-'),
                           DataTypeConstants.CAMS_TIFF, relative_path)

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    return DataSetMetaInfo(GLOBAL, path[-13:-3], path[-13:-3], DataTypeConstants.CAMS, path)

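# Hedged sketch (the path below and its '.nc' suffix are assumptions): the slice above expects paths that
# end in an ISO date followed by a three-character extension, e.g. '<...>/2017-09-04.nc'.
_example_path = '/some/dir/2017-09-04.nc'
assert _example_path[-13:-3] == '2017-09-04'
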
def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    coverage = self._extract_coverage(path)
    start_time = self._extract_start_time(path)
    end_time = self._extract_end_time(path)
    return DataSetMetaInfo(coverage, start_time, end_time, self.name(), path)

def extract_meta_info(self, path: str) -> DataSetMetaInfo:
    coverage = self._extract_coverage(path)
    time = self._extract_time_from_metadata_file(path)
    return DataSetMetaInfo(coverage, time, time, self.name(), path)