def test_new_and_old_on_disk(test_dataset: DatasetForTests,
                             integration_test_data: Path,
                             other_dataset: DatasetForTests):
    # Same id as other_dataset: it gets indexed below, then its file is removed.
    old_indexed = DatasetLite(uuid.UUID('5294efa6-348d-11e7-a079-185e0f80a5c0'))

    # An indexed file not on disk, and a disk file not in the index.
    missing_dataset = other_dataset
    missing_dataset.add_to_index()
    # Make it missing
    shutil.rmtree(str(missing_dataset.copyable_path))

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            missing_dataset.uri,
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.LocationMissingOnDisk(old_indexed, missing_dataset.uri),
            mm.DatasetNotIndexed(test_dataset.dataset, test_dataset.uri)
        ],
        expected_index_result={
            test_dataset.dataset: (test_dataset.uri,),
            old_indexed: (),
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )

def _find_uri_mismatches(index_url: str, uri: str, validate_data=True) -> Iterable[Mismatch]:
    """
    Compare the index and filesystem contents for the given uri,
    yielding Mismatches for any differences.
    """
    # pylint: disable=protected-access
    index = Index(PostgresDb(PostgresDb._create_engine(index_url)))

    def ids(datasets):
        return [d.id for d in datasets]

    path = uri_to_local_path(uri)
    log = _LOG.bind(path=path)
    log.debug("index.get_dataset_ids_for_uri")
    indexed_datasets = set(get_datasets_for_uri(index, uri))

    datasets_in_file = set()  # type: Set[DatasetLite]
    if path.exists():
        try:
            datasets_in_file = set(map(DatasetLite, paths.get_path_dataset_ids(path)))
        except InvalidDocException as e:
            # Should we do something with indexed_datasets here? If there's none, we're more willing to trash.
            log.info("invalid_path", error_args=e.args)
            yield UnreadableDataset(None, uri)
            return

    log.info("dataset_ids",
             indexed_dataset_ids=ids(indexed_datasets),
             file_ids=ids(datasets_in_file))

    if validate_data:
        validation_success = validate.validate_dataset(path, log=log)
        if not validation_success:
            yield InvalidDataset(None, uri)
            return

    for indexed_dataset in indexed_datasets:
        # Does the dataset exist in the file?
        if indexed_dataset in datasets_in_file:
            if indexed_dataset.is_archived:
                yield ArchivedDatasetOnDisk(indexed_dataset, uri)
        else:
            yield LocationMissingOnDisk(indexed_dataset, uri)

    # For all file ids not in the index.
    file_ds_not_in_index = datasets_in_file.difference(indexed_datasets)

    if not file_ds_not_in_index:
        log.info("no mismatch found (dataset already indexed)")

    for dataset in file_ds_not_in_index:
        # If it's already indexed, we just need to add the location.
        indexed_dataset = index.datasets.get(dataset.id)
        if indexed_dataset:
            log.info("location_not_indexed", indexed_dataset=indexed_dataset)
            yield LocationNotIndexed(DatasetLite.from_agdc(indexed_dataset), uri)
        else:
            log.info("dataset_not_indexed", dataset=dataset, uri=uri)
            yield DatasetNotIndexed(dataset, uri)

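# A minimal usage sketch of the generator above, assuming this module's names
# are in scope. The index URL and file URI are illustrative placeholders, not
# real values, and _example_scan_one_uri is a hypothetical helper, not part of
# the tool itself.
def _example_scan_one_uri():
    # Walk every difference between the index and one on-disk location.
    for mismatch in _find_uri_mismatches(
            index_url='postgresql://localhost/datacube',  # placeholder
            uri='file:///data/example/ga-metadata.yaml',  # placeholder
            validate_data=False,
    ):
        # Each yielded value is one of the Mismatch subclasses
        # (LocationMissingOnDisk, DatasetNotIndexed, ...).
        print(mismatch)
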
def test_archived_on_disk(test_dataset: DatasetForTests, integration_test_data: Path):
    """
    An already-archived dataset on disk: it should be reported, but the file
    should not be touched (trash_archived is false).
    """
    test_dataset.add_to_index()
    test_dataset.archive_in_index()
    archived_time = test_dataset.get_index_record().archived_time

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location should exist before test begins."

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[
            test_dataset.uri
        ],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(DatasetLite(test_dataset.dataset.id, archived_time), test_dataset.uri),
        ],
        expected_index_result={
            # Not active in the index, as it's archived:
            # test_dataset.dataset: (test_dataset.uri,),
            # But the parent dataset still is:
            test_dataset.parent: (),
        },
        cache_path=integration_test_data,
        fix_settings=dict(index_missing=True, update_locations=True)
    )

    assert uri_to_local_path(test_dataset.uri).exists(), "On-disk location shouldn't be touched"

@staticmethod
def from_dict(row: dict) -> 'Mismatch':
    """Reconstruct a Mismatch subclass instance from a to_dict() row."""
    # The snake_case 'name' field maps to the class name,
    # e.g. 'dataset_not_indexed' -> DatasetNotIndexed.
    mismatch_class = getattr(sys.modules[__name__], strutils.under2camel(row['name']))

    dataset_id = row['dataset_id'].strip()
    dataset = None
    if dataset_id and dataset_id != 'None':
        dataset = DatasetLite(UUID(dataset_id))

    return mismatch_class(dataset, row['uri'].strip())

def freeze_index(index: Index) -> Mapping[DatasetLite, Iterable[str]]:
    """
    All contained (dataset_id, [location]) values, to check test results.
    """
    return dict(
        (
            DatasetLite(dataset.id, archived_time=dataset.archived_time),
            tuple(dataset.uris)
        )
        for dataset in index.datasets.search()
    )

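# A sketch of the mapping shape freeze_index() returns, matching the
# expected_index_result dicts the tests compare against. The UUID and URI are
# illustrative placeholders, `index` is assumed to be an open Index, and
# _example_frozen_shape is a hypothetical helper, not part of the test suite.
def _example_frozen_shape(index: Index):
    frozen = freeze_index(index)
    # Each DatasetLite key (carrying archived_time, if any) maps to the tuple
    # of that dataset's indexed location URIs.
    assert frozen == {
        DatasetLite(UUID('582e9a74-d343-42d2-9105-a248b4b04f4a')): (
            'file:///data/example/ga-metadata.yaml',
        ),
    }
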
def test_is_trashed(test_dataset: DatasetForTests, integration_test_data: Path, archived_dt, expect_to_be_trashed):
    root = integration_test_data

    # Same scenario as above, but with trash_archived=True, so the archived
    # file should be moved into the trash.
    register_base_directory(root)
    test_dataset.add_to_index()
    test_dataset.archive_in_index(archived_dt=archived_dt)

    archived_on_disk = DatasetLite(test_dataset.dataset.id, archived_time=archived_dt)
    trashed_path = test_dataset.base_path.joinpath(*_TRASH_PREFIX, *test_dataset.path_offset)

    # Before the test, the file is in place and nothing is trashed.
    assert test_dataset.path.exists(), "On-disk location should exist before test begins."
    assert not trashed_path.exists(), "Trashed file shouldn't exist yet."

    _check_sync(
        collection=test_dataset.collection,
        expected_paths=[test_dataset.uri],
        expected_mismatches=[
            mm.ArchivedDatasetOnDisk(archived_on_disk, test_dataset.uri),
        ],
        expected_index_result={
            # Archived: shouldn't be active in the index.
            # test_dataset.dataset: (test_dataset.uri,),
            # The provenance parent should still exist, as it wasn't archived.
            test_dataset.parent: (),
        },
        cache_path=root,
        fix_settings=dict(index_missing=True, update_locations=True, trash_archived=True)
    )

    # Show output structure for debugging:
    # print("Output structure")
    # for p in paths.list_file_paths(root):
    #     print(f"\t{p}")

    if expect_to_be_trashed:
        assert trashed_path.exists(), "File isn't in trash."
        assert not test_dataset.path.exists(), "On-disk location still exists (should have been moved to trash)."
    else:
        assert not trashed_path.exists(), "File shouldn't have been trashed."
        assert test_dataset.path.exists(), "On-disk location should still be in place."

def test_load_dump_mismatch():
    mismatch = DatasetNotIndexed(
        DatasetLite(UUID("c98c3f2e-add7-4b34-9c9f-2cb8c7f806d2")),
        uri='file:///g/data/fk4/datacube/002/LS5_TM_FC/-17_-31/LS5_TM_FC_3577_-17_-31_19920722013931500000.nc'
    )

    row = mismatch.to_dict()
    assert row == {
        'name': 'dataset_not_indexed',
        'dataset_id': 'c98c3f2e-add7-4b34-9c9f-2cb8c7f806d2',
        'uri': 'file:///g/data/fk4/datacube/002/LS5_TM_FC/-17_-31/LS5_TM_FC_3577_-17_-31_19920722013931500000.nc',
    }

    deserialised_mismatch = Mismatch.from_dict(row)
    assert deserialised_mismatch == mismatch
    assert deserialised_mismatch.__dict__ == mismatch.__dict__

def test_load_from_file():
    root = write_files({
        'outputs.jsonl': """
{"name":"archived_dataset_on_disk","dataset_id":"582e9a74-d343-42d2-9105-a248b4b04f4a",\
"uri":"file:///g/data/fk4/datacube/002/LS5_TM_FC/-10_-39/LS5_TM_FC_3577_-10_-39_19990918011811500000.nc"}
{"name":"unreadable_dataset", "dataset_id":"None","uri":\
"file:///g/data/fk4/datacube/002/LS5_TM_FC/0_-30/LS5_TM_FC_3577_0_-30_20080331005819500000.nc"}
"""
    })

    mismatches = list(mismatches_from_file(root.joinpath('outputs.jsonl')))

    assert mismatches == [
        ArchivedDatasetOnDisk(
            DatasetLite(UUID('582e9a74-d343-42d2-9105-a248b4b04f4a')),
            'file:///g/data/fk4/datacube/002/LS5_TM_FC/-10_-39/LS5_TM_FC_3577_-10_-39_19990918011811500000.nc'
        ),
        UnreadableDataset(
            None,
            'file:///g/data/fk4/datacube/002/LS5_TM_FC/0_-30/LS5_TM_FC_3577_0_-30_20080331005819500000.nc'
        )
    ]

@property
def parent(self) -> Optional[DatasetLite]:
    """The source dataset that will also be indexed if this dataset is indexed."""
    return DatasetLite(self.parent_id) if self.parent_id else None

@property
def dataset(self) -> DatasetLite:
    """This dataset as a lightweight DatasetLite record."""
    return DatasetLite(self.id_)

@pytest.fixture
def work_path(tmpdir):
    paths.NCI_WORK_ROOT = Path(tmpdir) / 'work'
    paths.NCI_WORK_ROOT.mkdir()
    # The default timestamp-based offset will collide when runs happen quickly, as in unit tests.
    paths._JOB_WORK_OFFSET = '{output_product}-{task_type}-{request_uuid}'
    return paths.NCI_WORK_ROOT


@pytest.fixture
def integration_test_data(tmpdir):
    temp_data_dir = Path(tmpdir) / 'integration_data'
    shutil.copytree(INTEGRATION_TEST_DATA, temp_data_dir)
    return temp_data_dir


ON_DISK2_ID = DatasetLite(uuid.UUID('10c4a9fe-2890-11e6-8ec8-a0000100fe80'))
ON_DISK2_OFFSET = ('LS8_OLITIRS_OTH_P51_GALPGS01-032_114_080_20150924', 'ga-metadata.yaml')


class DatasetForTests(NamedTuple):
    """
    A test dataset, including the file location and the collection it should
    belong to.

    When your test starts, the dataset is on disk but not yet indexed. Call
    add_to_index() and the other helpers as needed.

    All properties are recorded here separately so tests can verify them
    independently.
    """
    # The test collection this should belong to
    collection: Collection