def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Yield a Mismatch for every difference between what the index records
    for ``uri`` and the datasets found at the matching local path.

    :param index_url: URL used to create the database engine behind the index.
    :param uri: location to check; converted to a local path for the
        filesystem side of the comparison.
    :param validate_data: when true and the path exists, the path is also
        passed to ``validate.validate_dataset``; a failed validation yields
        a single ``InvalidDataset`` and stops the comparison.
    """
    # The index is built here from the URL rather than being passed in.
    # pylint: disable=protected-access
    engine = PostgresDb._create_engine(index_url)
    index = Index(PostgresDb(engine))

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    in_index = set(get_datasets_for_uri(index, uri))

    # Stays empty when the path does not exist, so every indexed dataset is
    # then reported as missing on disk by the first loop below.
    on_disk = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            on_disk = {
                DatasetLite(dataset_id)
                for dataset_id in paths.get_path_dataset_ids(path)
            }
        except InvalidDocException as e:
            # Should we do something with the indexed datasets here?
            # If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

        log.info(
            "dataset_ids",
            indexed_dataset_ids=[d.id for d in in_index],
            file_ids=[d.id for d in on_disk],
        )

        if validate_data and not validate.validate_dataset(path, log=log):
            yield InvalidDataset(None, uri)
            return

    # Index -> disk: each indexed dataset should be present in the file,
    # and anything present there should not be archived.
    for indexed_dataset in in_index:
        if indexed_dataset not in on_disk:
            yield LocationMissingOnDisk(indexed_dataset, uri)
        elif indexed_dataset.is_archived:
            yield ArchivedDatasetOnDisk(indexed_dataset, uri)

    # Disk -> index: datasets in the file that the index does not list
    # for this uri.
    only_on_disk = on_disk - in_index

    # NOTE(review): this is logged whenever nothing on disk is unindexed,
    # even if the loop above already yielded mismatches.
    if not only_on_disk:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in only_on_disk:
        known = index.datasets.get(dataset.id)
        if not known:
            log.info("dataset_not_index", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
            continue

        # The dataset is already indexed; only this location is missing.
        log.info("location_not_indexed", indexed_dataset=known)
        yield LocationNotIndexed(DatasetLite.from_agdc(known), uri)
def test_with_standard_index(uninitialised_postgres_db):
    """Wrap the ``uninitialised_postgres_db`` argument in an ``Index`` and call ``init_db()`` on it."""
    index = Index(uninitialised_postgres_db)
    index.init_db()