def enforce_uniqueness_constraints(graph: ProvDocument) -> ProvDocument:
    """Enforce model uniqueness constraints.

    Remove node duplicates:
        ProvDocument.unified takes care of this by removing nodes with
        the same id.

    Remove relation duplicates:
        Allow only one relation of a certain type between two nodes.
        Enforcing this constraint after having populated the model
        instead of during population simplifies the model creation.
    """
    deduplicated = []
    seen = set()
    # Keep only the first relation of each (type, source, target) triple.
    for relation in graph.get_records(ProvRelation):
        (_, src), (_, dst) = relation.formal_attributes[:2]
        key = (type(relation), src, dst)
        if key not in seen:
            seen.add(key)
            deduplicated.append(relation)
    # Elements are carried over untouched; unified() merges same-id nodes.
    deduplicated.extend(graph.get_records(ProvElement))
    return ProvDocument(deduplicated).unified()
def calculate_flat_provenance_types(
    prov_doc: ProvDocument,
    to_level: int = 0,
    including_primitives_types: bool = True,
    counting_wdf_as_two: bool = False,
    ignored_types: Iterable[str] = ϕ,
) -> MultiLevelTypeDict:
    """Compute flat provenance types for every node up to ``to_level``.

    Level-0 types are the element types of each node; level-k types are
    built by propagating level-(k-1) types backwards along relations.
    When ``counting_wdf_as_two`` is set, a derivation edge contributes
    two hops instead of one.
    """
    # Flatten all the bundles, if any.
    prov_doc = prov_doc.flattened()

    # Index structures: node -> its level-0 type set, and
    # node -> set of (relation type, predecessor) edges pointing at it.
    level0_types = defaultdict(set)  # type: Dict[QualifiedName, Set[QualifiedName]]
    predecessors = defaultdict(set)  # type: Dict[QualifiedName, Set[Tuple[QualifiedName, QualifiedName]]]
    types_to_ignore: FrozenSet[str] = frozenset(ignored_types)

    # Index node types and relations in a single pass over the document.
    for record in prov_doc.get_records():  # type: ProvRecord
        if record.is_element():
            level0_types[record.identifier] |= get_element_types(
                record, including_primitives_types, types_to_ignore
            )
        elif record.is_relation():
            rel_type = record.get_type()
            _, values = zip(*record.formal_attributes)
            # Expecting a QualifiedName from the first argument of a relation.
            predecessor, successor = values[:2]
            if predecessor is not None and successor is not None:
                predecessors[successor].add((rel_type, predecessor))

    # The type map for this graph, keyed first by level, then by node.
    fp_types = defaultdict(dict)  # type: MultiLevelTypeDict
    # Convert type sets to FlatProvenanceType level 0 (a 1-tuple of frozensets).
    fp_types[0] = {
        node: (frozenset(type_set),) for node, type_set in level0_types.items()
    }

    # Propagate level-0 types up to the requested level.
    for k in range(1, to_level + 1):
        # Only propagate (k-1)-types from nodes that have them.
        for node, node_types in fp_types[k - 1].items():
            for rel_type, predecessor in predecessors[node]:
                k_type = node_types + (frozenset({rel_type}),)  # type: FlatProvenanceType
                if counting_wdf_as_two and (rel_type == PROV_DERIVATION):
                    # A derivation counts as two hops: extend by one more
                    # step and record the result at level k + 1.
                    k_p1_type = k_type + (frozenset({rel_type}),)  # type: FlatProvenanceType
                    if predecessor in fp_types[k + 1]:
                        fp_types[k + 1][predecessor] = join_flat_types(
                            fp_types[k + 1][predecessor], k_p1_type
                        )
                    else:
                        fp_types[k + 1][predecessor] = k_p1_type
                else:
                    if predecessor in fp_types[k]:
                        fp_types[k][predecessor] = join_flat_types(
                            fp_types[k][predecessor], k_type
                        )
                    else:
                        fp_types[k][predecessor] = k_type

    return fp_types
def count_record_types(prov_doc: ProvDocument) -> dict:
    """Count PROV record types in a document, including its bundles.

    Returns a dict mapping the PROV-N name of each record type to the
    number of records of that type found in the document body and in
    every bundle.
    """
    type_counts = Counter(record.get_type() for record in prov_doc.get_records())
    # Records inside bundles are not returned by the document-level
    # get_records() call, so count them bundle by bundle.
    for bundle in prov_doc.bundles:
        type_counts.update(record.get_type() for record in bundle.get_records())
    return {PROV_N_MAP[rec_type]: count for rec_type, count in type_counts.items()}
def test_document_update_simple(self):
    """update() merges records and bundles; a non-document argument raises."""
    doc_a = ProvDocument()
    doc_a.set_default_namespace(EX_URI)
    doc_a.entity('e')
    doc_a.bundle('b1').entity('e')

    doc_b = ProvDocument()
    doc_b.set_default_namespace(EX_URI)
    doc_b.entity('e')
    doc_b.bundle('b1').entity('e')
    doc_b.bundle('b2').entity('e')

    # update() only accepts documents/bundles.
    self.assertRaises(ProvException, lambda: doc_a.update(1))
    doc_a.update(doc_b)
    # Same-id records merge; bundles b1 and b2 are both present.
    self.assertEqual(len(doc_a.get_records()), 2)
    self.assertEqual(len(doc_a.bundles), 2)
# NOTE(review): this is a byte-identical duplicate of the preceding
# test_document_update_simple; inside one class the later definition
# silently shadows the earlier, so only one copy ever runs. Consider
# deleting one or renaming this if a distinct scenario was intended.
def test_document_update_simple(self):
    """update() merges records and bundles; a non-document argument raises."""
    first = ProvDocument()
    first.set_default_namespace(EX_URI)
    first.entity('e')
    bundle = first.bundle('b1')
    bundle.entity('e')

    second = ProvDocument()
    second.set_default_namespace(EX_URI)
    second.entity('e')
    bundle = second.bundle('b1')
    bundle.entity('e')
    bundle = second.bundle('b2')
    bundle.entity('e')

    # update() rejects anything that is not a document/bundle.
    self.assertRaises(ProvException, lambda: first.update(1))
    first.update(second)
    self.assertEqual(len(first.get_records()), 2)
    self.assertEqual(len(first.bundles), 2)