def test_archived_on_disk(test_dataset: DatasetForTests, integration_test_data: Path):
    # type: (Tuple[Collection, DatasetLite, str, Path]) -> None
    """
    An already-archived dataset on disk. Should report it, but not touch the file (trash_archived is false)
    """
    test_dataset.add_to_index()
    test_dataset.archive_in_index()
    archived_time = test_dataset.get_index_record().archived_time

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location should exist before test begins."

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(DatasetLite(test_dataset.dataset.id, archived_time), test_dataset.uri),
        ],
        expected_index_result={
            # Not active in index, as it's archived.
            # on_disk: (on_disk_uri,),
            # But the parent dataset still is:
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location shouldn't be touched"
def test_uri_to_local_path():
    if os.name == 'nt':
        assert 'C:\\tmp\\test.tmp' == str(uri_to_local_path('file:///C:/tmp/test.tmp'))
    else:
        assert '/tmp/something.txt' == str(uri_to_local_path('file:///tmp/something.txt'))

    assert uri_to_local_path(None) is None

    with pytest.raises(ValueError):
        uri_to_local_path('ftp://example.com/tmp/something.txt')
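A minimal round-trip sketch, assuming uri_to_local_path returns a pathlib.Path for file:// URIs (as the string comparisons above imply); the tmp_path fixture and file name are illustrative only:

def test_uri_to_local_path_roundtrip(tmp_path):
    # Create a real file, turn its path into a file:// URI, and convert it back.
    dataset_doc = tmp_path / 'ga-metadata.yaml'
    dataset_doc.write_text('{}')
    assert uri_to_local_path(dataset_doc.as_uri()) == dataset_doc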
def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uris,
    yielding Mismatches of any differences.
    """
    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")

    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

    log.info("dataset_ids",
             indexed_dataset_ids=ids(indexed_datasets),
             file_ids=ids(datasets_in_file))

    if validate_data:
        validation_success = validate.validate_dataset(path, log=log)
        if not validation_success:
            yield InvalidDataset(None, uri)
            return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_indexed", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
def local_path(self):
    """
    A path to this dataset on the local filesystem (if available).

    :rtype: pathlib.Path
    """
    return uri_to_local_path(self.local_uri)
def test_move_on_disk(test_dataset: DatasetForTests,
                      integration_test_data: Path,
                      other_dataset: DatasetForTests):
    # type: (Tuple[Collection, DatasetLite, str, Path]) -> None
    """
    Indexed dataset was moved over the top of another indexed dataset
    """
    test_dataset.add_to_index()
    other_dataset.add_to_index()

    shutil.move(other_dataset.path, str(uri_to_local_path(test_dataset.uri)))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri,
            other_dataset.path.as_uri(),
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(test_dataset.dataset, test_dataset.uri),
            mm.LocationNotIndexed(other_dataset.dataset, test_dataset.uri),
            mm.LocationMissingOnDisk(other_dataset.dataset, other_dataset.path.as_uri()),
        ],
        expected_index_result={
            test_dataset.dataset: (),
            other_dataset.dataset: (test_dataset.uri,),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
def test_replace_on_disk(test_dataset: DatasetForTests,
                         integration_test_data: Path,
                         other_dataset: DatasetForTests):
    # type: (Tuple[Collection, DatasetLite, str, Path]) -> None
    """
    File on disk has a different id to the one in the index (ie. it was quietly reprocessed)
    """
    test_dataset.add_to_index()

    # Move a new one over the top.
    shutil.move(other_dataset.path, str(uri_to_local_path(test_dataset.uri)))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(test_dataset.dataset, test_dataset.uri),
            mm.DatasetNotIndexed(other_dataset.dataset, test_dataset.uri),
        ],
        expected_index_result={
            test_dataset.dataset: (),
            other_dataset.dataset: (test_dataset.uri,),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
def test_detect_corrupt_new(test_dataset: DatasetForTests, integration_test_data: Path):
    # type: (Tuple[Collection, str, str, Path]) -> None
    """If a dataset exists but cannot be read, handle it as corrupt"""
    path = uri_to_local_path(test_dataset.uri)

    # Write corrupted file.
    os.unlink(str(path))
    with path.open('w') as f:
        f.write('corruption!')
    assert path.exists()

    # No dataset in index at the corrupt location, so it should be trashed.
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            mm.UnreadableDataset(None, test_dataset.uri)
        ],
        expected_index_result={},
        cache_path=integration_test_data,
        fix_settings=dict(trash_missing=True, trash_archived=True, update_locations=True)
    )

    assert not path.exists(), "Corrupt dataset without sibling should be trashed with trash_archived=True"
def test_detect_corrupt_existing(test_dataset: DatasetForTests, integration_test_data: Path):
    # type: (Tuple[Collection, str, str, Path]) -> None
    """If a dataset exists but cannot be read, report it as corrupt"""
    path = uri_to_local_path(test_dataset.uri)

    test_dataset.add_to_index()
    assert path.exists()

    # Overwrite with corrupted file.
    os.unlink(str(path))
    with path.open('w') as f:
        f.write('corruption!')
    assert path.exists()

    # Another dataset exists in the same location
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            # We don't know if it's the same dataset
            mm.UnreadableDataset(None, test_dataset.uri)
        ],
        # Unmodified index
        expected_index_result=freeze_index(test_dataset.collection.index_),
        cache_path=integration_test_data,
        fix_settings=dict(trash_missing=True, trash_archived=True, update_locations=True)
    )

    # If a dataset is in the index pointing to the corrupt location, it shouldn't be trashed with trash_archived=True
    assert path.exists(), "Corrupt dataset with sibling in index should not be trashed"
def _cleanup_uri(dry_run: bool, index: Index, input_uri: str, min_trash_age_hours: int, log):
    trash_count = 0

    latest_time_to_archive = _as_utc(datetime.utcnow()) - timedelta(hours=min_trash_age_hours)

    echo(f"Cleaning {'(dry run) ' if dry_run else ''}{style(input_uri, bold=True)}", err=True)

    locations = _get_archived_locations_within(index, latest_time_to_archive, input_uri)
    echo(f" {len(locations)} locations archived more than {min_trash_age_hours}hr ago", err=True)

    with click.progressbar(locations,
                           # stderr should be used for runtime information, not stdout
                           file=sys.stderr) as location_iter:
        for uri in location_iter:
            log = log.bind(uri=uri)

            local_path = uri_to_local_path(uri)
            if not local_path.exists():
                # An index record exists, but the file isn't on the disk.
                # We won't remove the record from the index: maybe the filesystem is temporarily unmounted?
                log.warning('location.not_exist')
                continue

            # Multiple datasets can point to the same location (eg. a stacked file).
            indexed_datasets = set(index.datasets.get_datasets_for_location(uri))

            # Check that there's no other active locations for this dataset.
            active_dataset = _get_dataset_where_active(uri, indexed_datasets)
            if active_dataset:
                log.info("location.has_active", active_dataset_id=active_dataset.id)
                continue

            # Are there any dataset ids in the file that we haven't indexed? Skip it.
            unindexed_ids = get_unknown_dataset_ids(index, uri)
            if unindexed_ids:
                log.info('location.has_unknown', unknown_dataset_ids=unindexed_ids)
                continue

            was_trashed = paths.trash_uri(uri, dry_run=dry_run, log=log)
            if not dry_run:
                for dataset in indexed_datasets:
                    index.datasets.remove_location(dataset.id, uri)
            if was_trashed:
                trash_count += 1

            log = log.unbind('uri')

    return len(locations), trash_count
def get_unknown_dataset_ids(index, uri):
    """Get ids of datasets in the file that have never been indexed"""
    on_disk_dataset_ids = set(paths.get_path_dataset_ids(uri_to_local_path(uri)))

    unknown_ids = set()
    for dataset_id in on_disk_dataset_ids:
        if not index.datasets.has(dataset_id):
            unknown_ids.add(dataset_id)
    return unknown_ids
def _url2rasterio(url_str, fmt, layer):
    """
    Turn a URL into a string that can be passed to rasterio.open
    """
    url = urlparse(url_str)
    assert url.scheme, "Expecting URL with scheme here"

    # If the format is NETCDF or HDF, we need to pass NETCDF:path:band as the filename to rasterio/GDAL.
    for nasty_format in ('netcdf', 'hdf'):
        if nasty_format in fmt.lower():
            if url.scheme != 'file':
                raise RuntimeError("Can't access %s over %s" % (fmt, url.scheme))
            filename = '%s:"%s":%s' % (fmt, uri_to_local_path(url_str), layer)
            return filename

    if url.scheme and url.scheme != 'file':
        return url_str

    # If it's a local path, strip the scheme and other gunk.
    return str(uri_to_local_path(url_str))
def _build_hdf_uri(url_str: str, fmt: str, layer: str) -> str:
    if is_vsipath(url_str):
        base = url_str
    else:
        url = urlparse(url_str)
        if url.scheme in (None, ''):
            raise ValueError("Expect either URL or /vsi path")

        if url.scheme != 'file':
            raise RuntimeError("Can't access %s over %s" % (fmt, url.scheme))
        base = str(uri_to_local_path(url_str))

    return '{}:"{}":{}'.format(fmt, base, layer)
def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better api should be pushed upstream to core: it currently only has
    a "scripts" implementation intended for cli use.
    """
    path = uri_to_local_path(uri)

    for d in dataset_script.load_datasets([path], _get_rules(index)):
        if d.id == dataset_id:
            index.datasets.add(d, sources_policy='ensure')
            break
    else:
        raise RuntimeError('Dataset not found at path: %s, %s' % (dataset_id, uri))
def _rio_uri(band: BandInfo) -> str:
    """
    - file uris are converted to file names
      - if also netcdf, wrap in NETCDF:"${filename}":${layer}
    - All other protocols go through unmodified
    """
    if band.uri_scheme == 'file':
        fname = str(uri_to_local_path(band.uri))

        if _is_netcdf(band.format):
            fname = 'NETCDF:"{}":{}'.format(fname, band.layer)

        return fname

    return band.uri
def trash_uri(uri: str, dry_run=False, log=_LOG):
    local_path = uri_to_local_path(uri)

    if not local_path.exists():
        log.warning("trash.not_exist", path=local_path)
        return

    # TODO: to handle sibling-metadata we should trash "all_dataset_paths" too.
    base_path, all_dataset_files = get_dataset_paths(local_path)

    trash_path = get_trash_path(base_path)

    log.info("trashing", base_path=base_path, trash_path=trash_path)

    if not dry_run:
        if not trash_path.parent.exists():
            os.makedirs(str(trash_path.parent))
        os.rename(str(base_path), str(trash_path))
def _url2rasterio(url_str: str, fmt: str, layer: Optional[str]) -> str:
    """
    Turn a URL into a string that can be passed to rasterio.open
    """
    if _is_hdf(fmt):
        if layer is None:
            raise ValueError("Missing layer for hdf/netcdf format dataset")
        return _build_hdf_uri(url_str, fmt, layer)

    if is_vsipath(url_str):
        return url_str

    url = urlparse(url_str)
    if url.scheme in (None, ''):
        raise ValueError("Expect either URL or /vsi path")

    if url.scheme == 'file':
        # If it's a local path, strip the scheme and other gunk.
        return str(uri_to_local_path(url_str))

    return url_str
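A minimal sketch of the string shapes this returns, assuming _is_hdf recognises 'NetCDF' and treats 'GeoTIFF' as non-hdf; the paths, bucket, and layer name are hypothetical:

if __name__ == '__main__':
    # hdf/netcdf bands are addressed as FORMAT:"path":layer for rasterio/GDAL,
    # e.g. 'NetCDF:"/data/tile.nc":red'
    print(_url2rasterio('file:///data/tile.nc', 'NetCDF', 'red'))
    # Plain file uris are reduced to a local filesystem path, e.g. '/data/scene/band1.tif'
    print(_url2rasterio('file:///data/scene/band1.tif', 'GeoTIFF', None))
    # Non-file schemes (e.g. s3) pass through unmodified
    print(_url2rasterio('s3://bucket/scene/band1.tif', 'GeoTIFF', None))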
def main(index, dry_run, ids):
    exists = 0
    missing = 0
    removed = 0
    missing_datasets = []

    for dataset_id in ids:
        with index._db.begin() as db:
            some_exist = False
            to_remove = []

            for uri in db.get_locations(dataset_id):
                local_path = utils.uri_to_local_path(uri)

                if local_path.exists():
                    exists += 1
                    some_exist = True
                    print("Exists: {}".format(uri))
                    continue
                else:
                    missing += 1
                    to_remove.append(uri)

            # If there are some valid locations, remove the invalid ones.
            if some_exist:
                for uri in to_remove:
                    was_removed = db.remove_location(dataset_id, uri) if not dry_run else False
                    print("Removed ({}): {}".format(was_removed, uri))
                    if was_removed:
                        removed += 1
            # If there are no valid locations, note the dataset for later.
            else:
                missing_datasets.append(dataset_id)

    print("Done: skipped {}, missing {}, removed {}".format(exists, missing, removed))
    print("{} without locations:".format(len(missing_datasets)))
    if missing_datasets:
        print("\t" + ("\n\t".join(missing_datasets)))
def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better api should be pushed upstream to core: it currently only has
    a "scripts" implementation intended for cli use.
    """
    yaml_path = uri_to_local_path(uri)

    def load_datasets(path, ds_resolve):
        for uri, ds in ui_path_doc_stream(path):
            dataset, err = ds_resolve(ds, uri)

            if dataset is None:
                _LOG.error('dataset is empty', error=str(err))
                continue

            is_consistent, reason = check_dataset_consistent(dataset)
            if not is_consistent:
                _LOG.error("dataset inconsistency", dataset=dataset.id, reason=str(reason))
                continue

            yield dataset

    ds_resolve = Doc2Dataset(index)

    for d in load_datasets([yaml_path], ds_resolve):
        if d.id == dataset_id:
            try:
                index.datasets.add(d)
                _LOG.info("dataset indexing successful", dataset_id=dataset_id)
                break
            except ValueError as err:
                _LOG.error('failed to index dataset', dataset_id=dataset_id, error=err)
    else:
        raise RuntimeError('dataset not found at path: %s, %s' % (dataset_id, uri))
def local_path(self) -> Optional[Path]:
    """
    A path to this dataset on the local filesystem (if available).
    """
    return uri_to_local_path(self.local_uri)