Пример #1
0
def test_dedup():
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    assert ds0.sources['ab'].sources['bc'].doc is not ds0.sources['ac'].doc
    assert ds0.sources['ab'].sources['bc'].doc == ds0.sources['ac'].doc

    ds = SimpleDocNav(dedup_lineage(ds0))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # again but with raw doc
    ds = SimpleDocNav(dedup_lineage(ds0.doc))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # Test that we detect inconsistent metadata for duplicate entries
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries

    # Subtest 1: different set of keys
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd'] = {}
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 2: different values for "child" nodes
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 3: different name for child
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['CD'] = srcs['cd']
    del srcs['cd']
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)
def check_with_existing_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add nodes BCE(D) with auto-matching, then add node A with product restricted to A only.
    """

    ds = SimpleDocNav(gen_dataset_test_dag(33, force_tree=True))

    child_docs = [ds.sources[x].doc for x in ('ab', 'ac', 'ae')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(ds.doc),
    })

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None
    assert index.datasets.get(ds.sources['ab'].id) is not None
    assert index.datasets.get(ds.sources['ac'].id) is not None

    clirunner([
        'dataset', 'add', '--no-auto-add-lineage', '--product', 'A',
        str(prefix / 'main.yml')
    ])

    assert index.datasets.get(ds.id) is not None
def check_missing_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Use --no-auto-add-lineage
    """
    ds = SimpleDocNav(gen_dataset_test_dag(44, force_tree=True))
    child_docs = [ds.sources[x].doc for x in ('ae', 'ab', 'ac')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(ds.doc),
    })

    r = clirunner(
        ['dataset', 'add', '--no-auto-add-lineage',
         str(prefix / 'main.yml')])

    assert 'ERROR Following lineage datasets are missing' in r.output
    assert index.datasets.has(ds.id) is False

    # now add lineage and try again
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.has(ds.sources['ae'].id)
    r = clirunner(
        ['dataset', 'add', '--no-auto-add-lineage',
         str(prefix / 'main.yml')])

    assert index.datasets.has(ds.id)
def check_no_product_match(clirunner, index):
    ds = SimpleDocNav(gen_dataset_test_dag(22, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    r = clirunner(['dataset', 'add',
                   '--product', 'A',
                   str(prefix)])
    assert 'ERROR Dataset metadata did not match product signature' in r.output

    r = clirunner(['dataset', 'add',
                   '--product', 'A',
                   '--product', 'B',
                   str(prefix)])
    assert 'ERROR No matching Product found for dataset' in r.output

    ds_ = index.datasets.get(ds.id, include_sources=True)
    assert ds_ is None

    # Ignore lineage but fail to match main dataset
    r = clirunner(['dataset', 'add',
                   '--product', 'B',
                   '--confirm-ignore-lineage',
                   str(prefix)])

    assert 'ERROR Dataset metadata did not match product signature' in r.output
    assert index.datasets.has(ds.id) is False
Пример #5
0
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    p = dataset_add_configs
    index = index_empty
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata_type', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(
        x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output
Пример #6
0
def test_remap_lineage_doc():
    def mk_node(ds, sources):
        return dict(id=ds.id, **sources)

    ds = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    xx = remap_lineage_doc(ds, mk_node)
    assert xx['id'] == ds.id
    assert xx['ac']['id'] == ds.sources['ac'].id

    xx = remap_lineage_doc(ds.doc, mk_node)
    assert xx['id'] == ds.id
    assert xx['ac']['id'] == ds.sources['ac'].id
def check_skip_lineage_test(clirunner, index):
    ds = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    clirunner(['dataset', 'add', '--confirm-ignore-lineage', '--product', 'A', str(prefix)])

    ds_ = index.datasets.get(ds.id, include_sources=True)
    assert ds_ is not None
    assert str(ds_.id) == ds.id
    assert ds_.sources == {}

    assert index.datasets.get(ds.sources['ab'].id) is None
    assert index.datasets.get(ds.sources['ac'].id) is None
    assert index.datasets.get(ds.sources['ae'].id) is None
    assert index.datasets.get(ds.sources['ac'].sources['cd'].id) is None
def check_inconsistent_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add node E,
    then try adding A with modified E in the lineage, should fail to add ABCD
    """
    ds = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))

    child_docs = [ds.sources[x].doc for x in ('ae', )]
    modified_doc = toolz.assoc_in(
        ds.doc, 'lineage.source_datasets.ae.label'.split('.'), 'modified')

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(modified_doc),
    })

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None

    r = clirunner(['dataset', 'add', str(prefix / 'main.yml')])

    assert 'ERROR Inconsistent lineage dataset' in r.output

    assert index.datasets.has(ds.id) is False
    assert index.datasets.has(ds.sources['ab'].id) is False
    assert index.datasets.has(ds.sources['ac'].id) is False
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id) is False

    # now again but skipping verification check
    r = clirunner(
        ['dataset', 'add', '--no-verify-lineage',
         str(prefix / 'main.yml')])

    assert index.datasets.has(ds.id)
    assert index.datasets.has(ds.sources['ab'].id)
    assert index.datasets.has(ds.sources['ac'].id)
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id)
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    p = dataset_add_configs
    index = index_empty
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])
    clirunner(['dataset', 'add', p.datasets_eo3])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    _ds, _err = doc2ds(ds.doc, 'file:///something')
    assert _err is None
    assert str(_ds.id) == ds.id
    assert _ds.metadata_doc == ds.doc

    # Check dataset search

    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(
        x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output

    # test dataset add eo3
    r = clirunner(['dataset', 'add', p.datasets_eo3])
    assert r.exit_code == 0

    ds_eo3 = load_dataset_definition(p.datasets_eo3)
    assert ds_eo3.location is not None

    _ds = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(_ds.sources) == ['a', 'bc1', 'bc2']
    assert _ds.crs == 'EPSG:3857'
    assert _ds.extent is not None
    assert _ds.extent.crs == _ds.crs
    assert _ds.uris == [ds_eo3.location]
    assert 'location' not in _ds.metadata_doc