def test_dedup():
    """Check that dedup_lineage collapses equal duplicate lineage nodes and
    rejects duplicates whose metadata or lineage subtrees disagree."""
    nav = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # The forced tree must contain two distinct-but-equal copies of node C.
    assert nav.sources['ab'].sources['bc'].doc is not nav.sources['ac'].doc
    assert nav.sources['ab'].sources['bc'].doc == nav.sources['ac'].doc

    # dedup_lineage accepts both a SimpleDocNav and a raw document;
    # after dedup the duplicate subtrees must share the same objects.
    for source in (nav, nav.doc):
        deduped = SimpleDocNav(dedup_lineage(source))
        assert deduped.sources['ab'].sources['bc'].doc is deduped.sources['ac'].doc
        assert (deduped.sources['ab'].sources['bc'].sources['cd'].doc
                is deduped.sources['ac'].sources['cd'].doc)

    # Duplicate entries with inconsistent metadata must be detected.
    nav = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    nav.sources['ac'].doc['label'] = 'Modified'
    nav = SimpleDocNav(nav.doc)
    assert nav.sources['ab'].sources['bc'].doc != nav.sources['ac'].doc
    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(nav)

    def check_inconsistent_subtree(mutate):
        # Build a fresh DAG, corrupt one copy of the duplicated 'ac' subtree
        # with `mutate`, and verify dedup_lineage rejects the document.
        bad = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
        srcs = toolz.get_in(bad.sources_path, bad.sources['ac'].doc)
        assert 'cd' in srcs
        mutate(srcs)
        bad = SimpleDocNav(bad.doc)
        with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
            dedup_lineage(bad)

    def blank_out_child(srcs):
        srcs['cd'] = {}

    def alter_child_id(srcs):
        srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'

    def rename_child(srcs):
        srcs['CD'] = srcs.pop('cd')

    # Subtest 1: different set of keys
    check_inconsistent_subtree(blank_out_child)
    # Subtest 2: different values for "child" nodes
    check_inconsistent_subtree(alter_child_id)
    # Subtest 3: different name for child
    check_inconsistent_subtree(rename_child)
def resolve(main_ds, uri):
    """Resolve a dataset document (and its lineage) into Dataset objects.

    Returns a ``(dataset, error)`` pair: on success the resolved dataset and
    ``None``; on failure ``None`` and either an exception or an error string.

    NOTE(review): this is a closure — ``index``, ``fail_on_missing_lineage``,
    ``verify_lineage``, ``flatten_datasets``, ``check_consistent``,
    ``match_product``, ``Dataset``, ``remap_lineage_doc`` and ``BadMatch``
    come from an enclosing scope not visible here.
    """
    try:
        # Collapse duplicate lineage nodes; inconsistent duplicates are an error.
        main_ds = SimpleDocNav(dedup_lineage(main_ds))
    except InvalidDocException as e:
        return None, e

    main_uuid = main_ds.id
    # Map each uuid to one representative node from the (deduped) lineage tree.
    ds_by_uuid = toolz.valmap(toolz.first, flatten_datasets(main_ds))
    all_uuid = list(ds_by_uuid)
    # Look up all referenced datasets already present in the index.
    db_dss = {str(ds.id): ds for ds in index.datasets.bulk_get(all_uuid)}

    # Lineage datasets are every uuid except the top-level one.
    lineage_uuids = set(filter(lambda x: x != main_uuid, all_uuid))
    missing_lineage = lineage_uuids - set(db_dss)

    if missing_lineage and fail_on_missing_lineage:
        return None, "Following lineage datasets are missing from DB: %s" % (','.join(missing_lineage))

    # EO3 documents skip the per-dataset consistency check against the DB copy.
    if verify_lineage and not is_doc_eo3(main_ds.doc):
        bad_lineage = []

        for uuid in lineage_uuids:
            if uuid in db_dss:
                # Compare the supplied lineage doc (minus its own sources)
                # against the version already stored in the index.
                ok, err = check_consistent(jsonify_document(ds_by_uuid[uuid].doc_without_lineage_sources),
                                           db_dss[uuid].metadata_doc)
                if not ok:
                    bad_lineage.append((uuid, err))

        if len(bad_lineage) > 0:
            error_report = '\n'.join('Inconsistent lineage dataset {}:\n> {}'.format(uuid, err)
                                     for uuid, err in bad_lineage)
            return None, error_report

    def with_cache(v, k, cache):
        # Store and return v, so it can be used inline in an expression.
        cache[k] = v
        return v

    def resolve_ds(ds, sources, cache=None):
        # Build (or fetch from cache) a Dataset for one lineage node.
        # NOTE(review): ``cache`` is always supplied by remap_lineage_doc
        # below; the None default is never exercised on that path.
        cached = cache.get(ds.id)
        if cached is not None:
            return cached

        # Only the top-level dataset carries the source uri.
        uris = [uri] if ds.id == main_uuid else []

        doc = ds.doc

        db_ds = db_dss.get(ds.id)
        if db_ds:
            # Reuse the product of the already-indexed copy.
            product = db_ds.type
        else:
            product = match_product(doc)

        return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache)

    try:
        return remap_lineage_doc(main_ds, resolve_ds, cache={}), None
    except BadMatch as e:
        return None, e