Example #1
def test_archived_on_disk(test_dataset: DatasetForTests,
                          integration_test_data: Path):
    """
    An already-archived dataset on disk: it should be reported, but the file should not be touched
    (trash_archived is false).
    """
    # archived_on_disk = DatasetLite(on_disk.id, archived_time=(datetime.utcnow() - timedelta(days=5)))
    test_dataset.add_to_index()
    test_dataset.archive_in_index()
    archived_time = test_dataset.get_index_record().archived_time

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location should exist before test begins."
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(DatasetLite(test_dataset.dataset.id, archived_time), test_dataset.uri),
        ],
        expected_index_result={
            # Not active in index, as it's archived.
            # on_disk: (on_disk_uri,),
            # But the parent dataset still is:
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location shouldn't be touched"
Example #2
def test_uri_to_local_path():
    if os.name == 'nt':
        assert 'C:\\tmp\\test.tmp' == str(uri_to_local_path('file:///C:/tmp/test.tmp'))

    else:
        assert '/tmp/something.txt' == str(uri_to_local_path('file:///tmp/something.txt'))

    assert uri_to_local_path(None) is None

    with pytest.raises(ValueError):
        uri_to_local_path('ftp://example.com/tmp/something.txt')
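A stand-alone sketch of the same behaviour, assuming uri_to_local_path is importable from datacube.utils (as these examples use it) and a POSIX filesystem; it also shows the round trip with pathlib's as_uri() that several tests above rely on:

from pathlib import Path

from datacube.utils import uri_to_local_path

# Round trip: a local path converted to a file:// URI and back again (POSIX paths assumed).
original = Path('/tmp/something.txt')
assert uri_to_local_path(original.as_uri()) == original

# None passes through unchanged, as the test above asserts.
assert uri_to_local_path(None) is None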
Example #3
def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding a Mismatch for each difference found.
    """

    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

        log.info("dataset_ids",
                 indexed_dataset_ids=ids(indexed_datasets),
                 file_ids=ids(datasets_in_file))

        if validate_data:
            validation_success = validate.validate_dataset(path, log=log)
            if not validation_success:
                yield InvalidDataset(None, uri)
                return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_index", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)
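A hypothetical way to consume the generator above; the connection string and dataset URI are placeholders, and this assumes the surrounding module is importable:

# Substitute a real index URL and a real dataset document location.
for mismatch in _find_uri_mismatches('postgresql:///datacube',
                                     'file:///data/scene/ga-metadata.yaml'):
    print(type(mismatch).__name__, mismatch)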
Example #4
    def local_path(self):
        """
        A path to this dataset on the local filesystem (if available).

        :rtype: pathlib.Path
        """
        return uri_to_local_path(self.local_uri)
Example #5
def test_move_on_disk(test_dataset: DatasetForTests,
                      integration_test_data: Path,
                      other_dataset: DatasetForTests):
    """
    Indexed dataset was moved over the top of another indexed dataset
    """
    test_dataset.add_to_index()
    other_dataset.add_to_index()

    shutil.move(other_dataset.path, str(uri_to_local_path(test_dataset.uri)))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri,
            other_dataset.path.as_uri(),
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(test_dataset.dataset, test_dataset.uri),
            mm.LocationNotIndexed(other_dataset.dataset, test_dataset.uri),
            mm.LocationMissingOnDisk(other_dataset.dataset, other_dataset.path.as_uri()),
        ],
        expected_index_result={
            test_dataset.dataset: (),
            other_dataset.dataset: (test_dataset.uri,),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
Example #6
def test_replace_on_disk(test_dataset: DatasetForTests,
                         integration_test_data: Path,
                         other_dataset: DatasetForTests):
    """
    The file on disk has a different id from the one in the index (i.e. it was quietly reprocessed)
    """
    test_dataset.add_to_index()

    # move a new one over the top
    shutil.move(other_dataset.path, str(uri_to_local_path(test_dataset.uri)))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(test_dataset.dataset, test_dataset.uri),
            mm.DatasetNotIndexed(other_dataset.dataset, test_dataset.uri),
        ],
        expected_index_result={
            test_dataset.dataset: (),
            other_dataset.dataset: (test_dataset.uri,),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )
Example #7
def test_detect_corrupt_new(test_dataset: DatasetForTests,
                            integration_test_data: Path):
    """If a dataset exists but cannot be read, handle it as corrupt"""

    path = uri_to_local_path(test_dataset.uri)

    # Write corrupted file.
    os.unlink(str(path))
    with path.open('w') as f:
        f.write('corruption!')
    assert path.exists()

    # No dataset in index at the corrupt location, so it should be trashed.
    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            mm.UnreadableDataset(None, test_dataset.uri)
        ],
        expected_index_result={},
        cache_path=integration_test_data,
        fix_settings=dict(trash_missing=True, trash_archived=True, update_locations=True)
    )
    assert not path.exists(), "Corrupt dataset without sibling should be trashed with trash_archived=True"
Example #8
def test_detect_corrupt_existing(test_dataset: DatasetForTests,
                                 integration_test_data: Path):
    """If a dataset exists but cannot be read, report as corrupt"""
    path = uri_to_local_path(test_dataset.uri)

    test_dataset.add_to_index()
    assert path.exists()

    # Overwrite with corrupted file.
    os.unlink(str(path))
    with path.open('w') as f:
        f.write('corruption!')
    assert path.exists()

    # Another dataset exists in the same location

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            # We don't know if it's the same dataset
            mm.UnreadableDataset(None, test_dataset.uri)
        ],
        # Unmodified index
        expected_index_result=freeze_index(test_dataset.collection.index_),
        cache_path=integration_test_data,
        fix_settings=dict(trash_missing=True, trash_archived=True, update_locations=True)
    )
    # If a dataset is in the index pointing to the corrupt location, it shouldn't be trashed with trash_archived=True
    assert path.exists(), "Corrupt dataset with sibling in index should not be trashed"
Example #9
def _cleanup_uri(dry_run: bool, index: Index, input_uri: str,
                 min_trash_age_hours: int, log):
    trash_count = 0

    latest_time_to_archive = _as_utc(
        datetime.utcnow()) - timedelta(hours=min_trash_age_hours)

    echo(
        f"Cleaning {'(dry run) ' if dry_run else ''}{style(input_uri, bold=True)}",
        err=True)

    locations = _get_archived_locations_within(index, latest_time_to_archive,
                                               input_uri)
    echo(
        f"  {len(locations)} locations archived more than {min_trash_age_hours}hr ago",
        err=True)
    with click.progressbar(
            locations,
            # stderr should be used for runtime information, not stdout
            file=sys.stderr) as location_iter:
        for uri in location_iter:
            log = log.bind(uri=uri)
            local_path = uri_to_local_path(uri)
            if not local_path.exists():
                # An index record exists, but the file isn't on the disk.
                # We won't remove the record from the index: maybe the filesystem is temporarily unmounted?
                log.warning('location.not_exist')
                continue

            # Multiple datasets can point to the same location (eg. a stacked file).
            indexed_datasets = set(
                index.datasets.get_datasets_for_location(uri))

            # Check that there's no other active locations for this dataset.
            active_dataset = _get_dataset_where_active(uri, indexed_datasets)
            if active_dataset:
                log.info("location.has_active",
                         active_dataset_id=active_dataset.id)
                continue

            # Are there any dataset ids in the file that we haven't indexed? Skip it.
            unindexed_ids = get_unknown_dataset_ids(index, uri)
            if unindexed_ids:
                log.info('location.has_unknown',
                         unknown_dataset_ids=unindexed_ids)
                continue

            was_trashed = paths.trash_uri(uri, dry_run=dry_run, log=log)
            if not dry_run:
                for dataset in indexed_datasets:
                    index.datasets.remove_location(dataset.id, uri)

            if was_trashed:
                trash_count += 1

            log = log.unbind('uri')
    return len(locations), trash_count
Example #10
def get_unknown_dataset_ids(index, uri):
    """Get ids of datasets in the file that have never been indexed"""
    on_disk_dataset_ids = set(
        paths.get_path_dataset_ids(uri_to_local_path(uri)))
    unknown_ids = set()
    for dataset_id in on_disk_dataset_ids:
        if not index.datasets.has(dataset_id):
            unknown_ids.add(dataset_id)

    return unknown_ids
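A hypothetical call, assuming the module context above and a reachable datacube index; the file URI is a placeholder:

from datacube import Datacube

dc = Datacube()  # connects using the default datacube configuration
unknown = get_unknown_dataset_ids(dc.index, 'file:///data/scene/ga-metadata.yaml')
if unknown:
    print('dataset ids on disk that were never indexed:', unknown)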
Example #11
def _url2rasterio(url_str, fmt, layer):
    """
    Turn a URL into a string that can be passed to rasterio.open.
    """
    url = urlparse(url_str)
    assert url.scheme, "Expecting URL with scheme here"

    # if format is NETCDF or HDF need to pass NETCDF:path:band as filename to rasterio/GDAL
    for nasty_format in ('netcdf', 'hdf'):
        if nasty_format in fmt.lower():
            if url.scheme != 'file':
                raise RuntimeError("Can't access %s over %s" % (fmt, url.scheme))
            filename = '%s:"%s":%s' % (fmt, uri_to_local_path(url_str), layer)
            return filename

    if url.scheme and url.scheme != 'file':
        return url_str

    # if local path strip scheme and other gunk
    return str(uri_to_local_path(url_str))
Example #12
def _build_hdf_uri(url_str: str, fmt: str, layer: str) -> str:
    if is_vsipath(url_str):
        base = url_str
    else:
        url = urlparse(url_str)
        if url.scheme in (None, ''):
            raise ValueError("Expect either URL or /vsi path")

        if url.scheme != 'file':
            raise RuntimeError("Can't access %s over %s" % (fmt, url.scheme))
        base = str(uri_to_local_path(url_str))

    return '{}:"{}":{}'.format(fmt, base, layer)
Example #13
def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better API should be pushed upstream to core: it currently has only a "scripts" implementation
    intended for CLI use.
    """
    path = uri_to_local_path(uri)
    for d in dataset_script.load_datasets([path], _get_rules(index)):
        if d.id == dataset_id:
            index.datasets.add(d, sources_policy='ensure')
            break
    else:
        raise RuntimeError('Dataset not found at path: %s, %s' %
                           (dataset_id, uri))
Example #14
def _rio_uri(band: BandInfo) -> str:
    """
    - file URIs are converted to file names
      - if also NetCDF, wrap in NETCDF:"${filename}":${layer}
    - all other protocols go through unmodified
    """
    if band.uri_scheme == 'file':
        fname = str(uri_to_local_path(band.uri))

        if _is_netcdf(band.format):
            fname = 'NETCDF:"{}":{}'.format(fname, band.layer)

        return fname

    return band.uri
Example #15
def trash_uri(uri: str, dry_run=False, log=_LOG):
    local_path = uri_to_local_path(uri)

    if not local_path.exists():
        log.warning("trash.not_exist", path=local_path)
        return

    # TODO: to handle sibling-metadata we should trash "all_dataset_paths" too.
    base_path, all_dataset_files = get_dataset_paths(local_path)

    trash_path = get_trash_path(base_path)

    log.info("trashing", base_path=base_path, trash_path=trash_path)

    if not dry_run:
        if not trash_path.parent.exists():
            os.makedirs(str(trash_path.parent))
        os.rename(str(base_path), str(trash_path))
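A hypothetical dry-run invocation, assuming the module context above; with dry_run=True the trash destination is logged but nothing is moved:

# The URI is a placeholder; substitute the location of a real dataset file.
trash_uri('file:///data/scene/ga-metadata.yaml', dry_run=True)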
Example #16
def _url2rasterio(url_str: str, fmt: str, layer: Optional[str]) -> str:
    """
    Turn a URL into a string that can be passed to rasterio.open.
    """
    if _is_hdf(fmt):
        if layer is None:
            raise ValueError("Missing layer for hdf/netcdf format dataset")

        return _build_hdf_uri(url_str, fmt, layer)

    if is_vsipath(url_str):
        return url_str

    url = urlparse(url_str)
    if url.scheme in (None, ''):
        raise ValueError("Expect either URL or /vsi path")

    if url.scheme == 'file':
        # if local path strip scheme and other gunk
        return str(uri_to_local_path(url_str))

    return url_str
Example #17
def main(index, dry_run, ids):
    exists = 0
    missing = 0
    removed = 0

    missing_datasets = []
    for dataset_id in ids:
        with index._db.begin() as db:
            some_exist = False
            to_remove = []
            for uri in db.get_locations(dataset_id):
                local_path = utils.uri_to_local_path(uri)

                if local_path.exists():
                    exists += 1
                    some_exist = True
                    print("Exists: {}".format(uri))
                    continue
                else:
                    missing += 1
                    to_remove.append(uri)
            # If there are some valid locations, remove invalid ones.
            if some_exist:
                for uri in to_remove:
                    was_removed = db.remove_location(
                        dataset_id, uri) if not dry_run else False
                    print("Removed ({}): {}".format(was_removed, uri))
                    if was_removed:
                        removed += 1
            # If there's no valid locations, note them for later
            else:
                missing_datasets.append(dataset_id)

    print("Done: skipped {}, missing {}, removed {}".format(
        exists, missing, removed))
    print("{} without locations:".format(len(missing_datasets)))
    if missing_datasets:
        print("\t" + ("\n\t".join(missing_datasets)))
Example #18
def add_dataset(index: Index, dataset_id: uuid.UUID, uri: str):
    """
    Index a dataset from a file uri.

    A better API should be pushed upstream to core: it currently has only a "scripts" implementation
    intended for CLI use.
    """
    yaml_path = uri_to_local_path(uri)

    def load_datasets(path, ds_resolve):
        for uri, ds in ui_path_doc_stream(path):

            dataset, err = ds_resolve(ds, uri)

            if dataset is None:
                _LOG.error('dataset is empty', error=str(err))
                continue

            is_consistent, reason = check_dataset_consistent(dataset)
            if not is_consistent:
                _LOG.error("dataset inconsistency", dataset=dataset.id, reason=str(reason))
                continue

            yield dataset

    ds_resolve = Doc2Dataset(index)

    for d in load_datasets([yaml_path], ds_resolve):
        if d.id == dataset_id:
            try:
                index.datasets.add(d)
                _LOG.info("dataset indexing successful", dataset_id=dataset_id)
                break
            except ValueError as err:
                _LOG.error('failed to index dataset', dataset_id=dataset_id, error=err)
    else:
        raise RuntimeError('dataset not found at path: %s, %s' % (dataset_id, uri))
Example #19
    def local_path(self) -> Optional[Path]:
        """
        A path to this dataset on the local filesystem (if available).
        """
        return uri_to_local_path(self.local_uri)