import copy
import datetime
from pathlib import Path

import pytest

from datacube.index import Index
from datacube.model import Dataset, MetadataType

# _pseudo_telemetry_dataset_type, _telemetry_dataset and _telemetry_uuid are
# module-level test fixtures defined elsewhere in this test module.


def test_index_dataset_with_location(index: Index, default_metadata_type: MetadataType):
    first_file = Path('/tmp/first/something.yaml').absolute()
    second_file = Path('/tmp/second/something.yaml').absolute()

    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)
    dataset = Dataset(type_, _telemetry_dataset, uris=[first_file.as_uri()], sources={})

    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)

    assert stored.id == _telemetry_uuid
    # TODO: Dataset types?
    assert stored.type.id == type_.id
    assert stored.metadata_type.id == default_metadata_type.id
    assert stored.local_path == Path(first_file)

    # Ingesting again should have no effect.
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Remove the location
    was_removed = index.datasets.remove_location(dataset.id, first_file.as_uri())
    assert was_removed
    was_removed = index.datasets.remove_location(dataset.id, first_file.as_uri())
    assert not was_removed
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 0

    # Re-add the location
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert was_added
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert not was_added
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # A rough date is ok: 1:01 beforehand just in case someone runs this during
    # daylight savings time conversion :)
    # (any UTC conversion errors will be off by much more than this for PST/AEST)
    before_archival_dt = utc_now() - datetime.timedelta(hours=1, minutes=1)

    was_archived = index.datasets.archive_location(dataset.id, first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == []
    locations = index.datasets.get_archived_locations(dataset.id)
    assert locations == [first_file.as_uri()]

    # It should return the time archived.
    location_times = index.datasets.get_archived_location_times(dataset.id)
    assert len(location_times) == 1
    location, archived_time = location_times[0]
    assert location == first_file.as_uri()
    assert utc_now() > archived_time > before_archival_dt

    was_restored = index.datasets.restore_location(dataset.id, first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Indexing with a new path should NOT add the second one.
    dataset.uris = [second_file.as_uri()]
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Add the location manually instead.
    index.datasets.add_location(dataset.id, second_file.as_uri())
    stored = index.datasets.get(dataset.id)
    assert len(stored.uris) == 2

    # Newest to oldest.
    assert stored.uris == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Can archive and restore the first file, and location order is preserved.
    was_archived = index.datasets.archive_location(dataset.id, first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id, first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Can archive and restore the second file, and location order is preserved.
    was_archived = index.datasets.archive_location(dataset.id, second_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [first_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id, second_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Indexing again without a location should have no effect.
    dataset.uris = []
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Check that the order of uris is preserved when indexing with more than one.
    second_ds_doc = copy.deepcopy(_telemetry_dataset)
    second_ds_doc['id'] = '366f32d8-e1f8-11e6-94b4-185e0f80a589'
    index.datasets.add(
        Dataset(type_, second_ds_doc, uris=['file:///a', 'file:///b'], sources={}))

    # Test order using the get_locations function.
    assert index.datasets.get_locations(second_ds_doc['id']) == ['file:///a', 'file:///b']

    # Test order using datasets.get(); it has a custom query, as it turns out.
    assert index.datasets.get(second_ds_doc['id']).uris == ['file:///a', 'file:///b']

    # Test update: this should prepend file:///c and file:///d to the existing list.
    index.datasets.update(
        Dataset(type_, second_ds_doc, uris=['file:///a', 'file:///c', 'file:///d'], sources={}))
    assert index.datasets.get_locations(second_ds_doc['id']) == [
        'file:///c', 'file:///d', 'file:///a', 'file:///b'
    ]
    assert index.datasets.get(second_ds_doc['id']).uris == [
        'file:///c', 'file:///d', 'file:///a', 'file:///b'
    ]

    # Ability to get datasets for a location.
    # Add a second dataset with a different location (to catch missing joins, filtering, etc.)
    second_ds_doc = copy.deepcopy(_telemetry_dataset)
    second_ds_doc['id'] = '366f32d8-e1f8-11e6-94b4-185e0f80a5c0'
    index.datasets.add(
        Dataset(type_, second_ds_doc, uris=[second_file.as_uri()], sources={}))

    for mode in ('exact', 'prefix', None):
        dataset_ids = [
            d.id
            for d in index.datasets.get_datasets_for_location(first_file.as_uri(), mode=mode)
        ]
        assert dataset_ids == [dataset.id]

    # A uri with a fragment appended is not the same location.
    assert list(
        index.datasets.get_datasets_for_location(first_file.as_uri() + "#part=100")) == []

    # An unknown mode should raise an error.
    with pytest.raises(ValueError):
        list(index.datasets.get_datasets_for_location(first_file.as_uri(), mode="nosuchmode"))
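
# A minimal sketch of the utc_now() helper the test above assumes; the real
# helper is defined elsewhere in this test module and may differ in detail.
# It must return a timezone-aware datetime, because archived_time as returned
# by the index is timezone-aware, and comparing naive and aware datetimes
# raises TypeError.
def utc_now():
    # Current time in UTC, timezone-aware.
    return datetime.datetime.now(tz=datetime.timezone.utc)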