Example #1
# NOTE: this function is an excerpt; the imports below are assumptions based on
# the digitalearthau / Open Data Cube codebase and may vary between versions.
from typing import Iterable, Set

import structlog

from datacube.index import Index
from datacube.drivers.postgres import PostgresDb
from datacube.utils import uri_to_local_path, InvalidDocException

from digitalearthau import paths
from digitalearthau.index import DatasetLite, get_datasets_for_uri
from digitalearthau.sync import validate
from digitalearthau.sync.differences import (
    Mismatch, UnreadableDataset, InvalidDataset, ArchivedDatasetOnDisk,
    LocationMissingOnDisk, LocationNotIndexed, DatasetNotIndexed,
)

_LOG = structlog.get_logger()


def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding a Mismatch for each difference found.
    """

    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

        log.info("dataset_ids",
                 indexed_dataset_ids=ids(indexed_datasets),
                 file_ids=ids(datasets_in_file))

        if validate_data:
            validation_success = validate.validate_dataset(path, log=log)
            if not validation_success:
                yield InvalidDataset(None, uri)
                return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_index", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
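
A minimal usage sketch; the connection URL and file URI below are illustrative
placeholders, not values from the original code:

# Hypothetical driver: print every mismatch found for one on-disk file.
for mismatch in _find_uri_mismatches(
        index_url="postgresql://localhost/datacube",
        uri="file:///data/tiles/example_dataset.nc",
        validate_data=False):
    print(type(mismatch).__name__, mismatch)
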
Example #2

(These helpers reuse the paths, uri_to_local_path, and _LOG imports from Example #1.)
def get_unknown_dataset_ids(index, uri):
    """Get ids of datasets in the file that have never been indexed"""
    on_disk_dataset_ids = set(
        paths.get_path_dataset_ids(uri_to_local_path(uri)))
    unknown_ids = set()
    for dataset_id in on_disk_dataset_ids:
        if not index.datasets.has(dataset_id):
            unknown_ids.add(dataset_id)

    return unknown_ids
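
A quick sketch of calling get_unknown_dataset_ids through a regular Datacube
connection; the file URI is an illustrative placeholder:

from datacube import Datacube

dc = Datacube()  # connects using the default datacube configuration
unknown = get_unknown_dataset_ids(dc.index, "file:///data/tiles/example_dataset.nc")
for dataset_id in sorted(unknown):
    print("never indexed:", dataset_id)
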
def _should_restore(index, trashed_nc):
    """
    Decide whether a trashed file should be restored to its original location.

    Returns the original path if the file holds an active, indexed dataset
    whose recorded location is currently free, otherwise None.
    """
    dataset_ids = paths.get_path_dataset_ids(trashed_nc)
    original_path = paths.get_original_path(trashed_nc)

    for dataset_id in dataset_ids:
        dataset = index.datasets.get(dataset_id)

        # index.datasets.get() returns None for ids that were never indexed.
        if dataset is None:
            _LOG.debug("dataset.skip.unindexed", dataset_id=dataset_id)
            continue

        if dataset.is_archived:
            _LOG.debug("dataset.skip.archived", dataset_id=dataset.id)
            continue
        if original_path.as_uri() not in dataset.uris:
            _LOG.debug("dataset.skip.unknown_location", dataset_id=dataset.id)
            continue
        # There's something else in the location?
        if original_path.exists():
            _LOG.debug("dataset.skip.original_exists", dataset_id=dataset.id)
            continue

        # We've found an indexed, active dataset in the file, so restore.
        return original_path

    # No restorable dataset was found in this file.
    return None
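
A sketch of the restore loop this helper supports; restore_trash, the trash
directory layout, and the *.nc glob are assumptions for illustration:

import shutil
from pathlib import Path


def restore_trash(index, trash_root: Path, dry_run: bool = True):
    """Move restorable trashed files back to their original locations."""
    for trashed_nc in trash_root.rglob("*.nc"):
        original_path = _should_restore(index, trashed_nc)
        if original_path is None:
            continue
        print(f"restore {trashed_nc} -> {original_path}")
        if not dry_run:
            original_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(trashed_nc), str(original_path))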