Example #1
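# Imports used by this test. The datacube import paths are a best guess and may
# differ between datacube-core versions; the module-level fixtures referenced
# below (_pseudo_telemetry_dataset_type, _telemetry_dataset, _telemetry_uuid,
# utc_now) are assumed to be defined elsewhere in the test module.
import copy
import datetime
from pathlib import Path

import pytest

from datacube.index import Index
from datacube.model import Dataset, MetadataType
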
def test_index_dataset_with_location(index: Index,
                                     default_metadata_type: MetadataType):
    first_file = Path('/tmp/first/something.yaml').absolute()
    second_file = Path('/tmp/second/something.yaml').absolute()

    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)
    dataset = Dataset(type_,
                      _telemetry_dataset,
                      uris=[first_file.as_uri()],
                      sources={})
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)

    assert stored.id == _telemetry_uuid
    # TODO: Dataset types?
    assert stored.type.id == type_.id
    assert stored.metadata_type.id == default_metadata_type.id
    assert stored.local_path == Path(first_file)

    # Ingesting again should have no effect.
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1
    # Remove the location
    was_removed = index.datasets.remove_location(dataset.id,
                                                 first_file.as_uri())
    assert was_removed
    was_removed = index.datasets.remove_location(dataset.id,
                                                 first_file.as_uri())
    assert not was_removed
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 0
    # Re-add the location
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert was_added
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert not was_added
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # A rough timestamp is ok: an hour and a minute beforehand, just in case someone runs this during a daylight saving time transition :)
    # (any UTC conversion errors will be off by much more than this for PST/AEST)
    before_archival_dt = utc_now() - datetime.timedelta(hours=1, minutes=1)

    was_archived = index.datasets.archive_location(dataset.id,
                                                   first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == []
    locations = index.datasets.get_archived_locations(dataset.id)
    assert locations == [first_file.as_uri()]

    # It should return the time archived.
    location_times = index.datasets.get_archived_location_times(dataset.id)
    assert len(location_times) == 1
    location, archived_time = location_times[0]
    assert location == first_file.as_uri()
    assert utc_now() > archived_time > before_archival_dt

    was_restored = index.datasets.restore_location(dataset.id,
                                                   first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Indexing with a new path should NOT add the second one.
    dataset.uris = [second_file.as_uri()]
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Add location manually instead
    index.datasets.add_location(dataset.id, second_file.as_uri())
    stored = index.datasets.get(dataset.id)
    assert len(stored.uris) == 2

    # Newest to oldest.
    assert stored.uris == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Can archive and restore the first file, and location order is preserved
    was_archived = index.datasets.archive_location(dataset.id,
                                                   first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id,
                                                   first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Can archive and restore the second file, and location order is preserved
    was_archived = index.datasets.archive_location(dataset.id,
                                                   second_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [first_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id,
                                                   second_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Indexing again without a location should have no effect.
    dataset.uris = []
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Check that the order of uris is preserved when indexing with more than one location
    second_ds_doc = copy.deepcopy(_telemetry_dataset)
    second_ds_doc['id'] = '366f32d8-e1f8-11e6-94b4-185e0f80a589'
    index.datasets.add(
        Dataset(type_,
                second_ds_doc,
                uris=['file:///a', 'file:///b'],
                sources={}))

    # Test the order using the get_locations() function
    assert index.datasets.get_locations(
        second_ds_doc['id']) == ['file:///a', 'file:///b']

    # Test the order using datasets.get(); it turns out to use a custom query
    assert index.datasets.get(
        second_ds_doc['id']).uris == ['file:///a', 'file:///b']

    # Test update(); this should prepend file:///c and file:///d to the existing list
    index.datasets.update(
        Dataset(type_,
                second_ds_doc,
                uris=['file:///a', 'file:///c', 'file:///d'],
                sources={}))
    assert index.datasets.get_locations(second_ds_doc['id']) == [
        'file:///c', 'file:///d', 'file:///a', 'file:///b'
    ]
    assert index.datasets.get(second_ds_doc['id']).uris == [
        'file:///c', 'file:///d', 'file:///a', 'file:///b'
    ]

    # Ability to get datasets for a location
    # Add a second dataset with a different location (to catch a lack of joins, filtering, etc.)
    second_ds_doc = copy.deepcopy(_telemetry_dataset)
    second_ds_doc['id'] = '366f32d8-e1f8-11e6-94b4-185e0f80a5c0'
    index.datasets.add(
        Dataset(type_, second_ds_doc, uris=[second_file.as_uri()], sources={}))
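    # The original dataset should be found at its location under exact, prefix, and default matching.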
    for mode in ('exact', 'prefix', None):
        dataset_ids = [
            d.id for d in index.datasets.get_datasets_for_location(
                first_file.as_uri(), mode=mode)
        ]
        assert dataset_ids == [dataset.id]

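    # A URI with an extra fragment should not match the stored location.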
    assert list(
        index.datasets.get_datasets_for_location(first_file.as_uri() +
                                                 "#part=100")) == []

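    # An unknown search mode should be rejected.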
    with pytest.raises(ValueError):
        list(
            index.datasets.get_datasets_for_location(first_file.as_uri(),
                                                     mode="nosuchmode"))
def test_index_dataset_with_location(index, default_metadata_type):
    """
    :type index: datacube.index._api.Index
    :type default_metadata_type: datacube.model.MetadataType
    """
    first_file = Path('/tmp/first/something.yaml').absolute()
    second_file = Path('/tmp/second/something.yaml').absolute()

    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)
    dataset = Dataset(type_,
                      _telemetry_dataset,
                      uris=[first_file.as_uri()],
                      sources={})
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)

    assert stored.id == _telemetry_uuid
    # TODO: Dataset types?
    assert stored.type.id == type_.id
    assert stored.metadata_type.id == default_metadata_type.id
    assert stored.local_path == Path(first_file)

    # Ingesting again should have no effect.
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1
    # Remove the location
    was_removed = index.datasets.remove_location(dataset.id,
                                                 first_file.as_uri())
    assert was_removed
    was_removed = index.datasets.remove_location(dataset.id,
                                                 first_file.as_uri())
    assert not was_removed
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 0
    # Re-add the location
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert was_added
    was_added = index.datasets.add_location(dataset.id, first_file.as_uri())
    assert not was_added
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # A rough timestamp is ok: an hour and a minute beforehand, just in case someone runs this during a daylight saving time transition :)
    # (any UTC conversion errors will be off by much more than this for PST/AEST)
    before_archival_dt = utc_now() - datetime.timedelta(hours=1, minutes=1)

    was_archived = index.datasets.archive_location(dataset.id,
                                                   first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == []
    locations = index.datasets.get_archived_locations(dataset.id)
    assert locations == [first_file.as_uri()]

    # It should return the time archived.
    location_times = index.datasets.get_archived_location_times(dataset.id)
    assert len(location_times) == 1
    location, archived_time = location_times[0]
    assert location == first_file.as_uri()
    assert utc_now() > archived_time > before_archival_dt

    was_restored = index.datasets.restore_location(dataset.id,
                                                   first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 1

    # Ingesting with a new path should add the second one too.
    dataset.uris = [second_file.as_uri()]
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Can archive and restore the first file, and location order is preserved
    was_archived = index.datasets.archive_location(dataset.id,
                                                   first_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id,
                                                   first_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Can archive and restore the second file, and location order is preserved
    was_archived = index.datasets.archive_location(dataset.id,
                                                   second_file.as_uri())
    assert was_archived
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [first_file.as_uri()]
    was_restored = index.datasets.restore_location(dataset.id,
                                                   second_file.as_uri())
    assert was_restored
    locations = index.datasets.get_locations(dataset.id)
    assert locations == [second_file.as_uri(), first_file.as_uri()]

    # Indexing again without a location should have no effect.
    dataset.uris = []
    index.datasets.add(dataset)
    stored = index.datasets.get(dataset.id)
    locations = index.datasets.get_locations(dataset.id)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_file.as_uri(), first_file.as_uri()]
    # And the second one is newer, so it should be returned as the default local path:
    assert stored.local_path == Path(second_file)

    # Ability to get datasets for a location
    # Add a second dataset with a different location (to catch a lack of joins, filtering, etc.)
    second_ds_doc = copy.deepcopy(_telemetry_dataset)
    second_ds_doc['id'] = '366f32d8-e1f8-11e6-94b4-185e0f80a5c0'
    index.datasets.add(
        Dataset(type_, second_ds_doc, uris=[second_file.as_uri()], sources={}))
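    # Only the original dataset should be found at the first location.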
    dataset_ids = [
        d.id
        for d in index.datasets.get_datasets_for_location(first_file.as_uri())
    ]
    assert dataset_ids == [dataset.id]