def base_entity_match(ingested_entity: EntityTree, db_entity: EntityTree) -> bool:
    """Decides whether the entities wrapped by two trees are a match.

    Meant for entities that may have no external id in the ingested state
    data. When neither entity has an external id, every flat field set on
    either of them (other than the primary key) has to be equal.

    Args:
        ingested_entity: Tree whose ``entity`` is the ingested object.
        db_entity: Tree whose ``entity`` is the database object.

    Returns:
        True if the two wrapped entities match, False otherwise.
    """
    ingested = cast(ExternalIdEntity, ingested_entity.entity)
    existing = cast(ExternalIdEntity, db_entity.entity)

    # A placeholder on either side rules out a match.
    if is_placeholder(ingested) or is_placeholder(existing):
        return False

    # Once either side carries an external id, the ids alone decide.
    if ingested.external_id or existing.external_id:
        return ingested.external_id == existing.external_id

    # Otherwise compare the union of flat fields set on either entity.
    ingested_names = get_set_entity_field_names(
        ingested, EntityFieldType.FLAT_FIELD)
    existing_names = get_set_entity_field_names(
        existing, EntityFieldType.FLAT_FIELD)
    names_to_compare = (
        name for name in ingested_names | existing_names
        # The primary key is never part of the comparison.
        if name != ingested.get_class_id_name()
    )
    return not any(
        get_field(ingested, name) != get_field(existing, name)
        for name in names_to_compare
    )
def _is_subset(entity: Entity, subset: Entity) -> bool:
    """Reports whether everything set on |subset| is also found on |entity|.

    Flat fields set on |subset| must compare equal on |entity|, and every
    object held in a forward-edge field of |subset| must be contained in the
    same field of |entity|.

    Returns:
        True if both conditions hold, otherwise False.
    """
    flat_names = get_set_entity_field_names(
        subset, EntityFieldType.FLAT_FIELD)
    if any(get_field(entity, name) != get_field(subset, name)
           for name in flat_names):
        return False

    edge_names = get_set_entity_field_names(
        subset, EntityFieldType.FORWARD_EDGE)
    return all(
        child in get_field_as_list(entity, name)
        for name in edge_names
        for child in get_field_as_list(subset, name)
    )
def test_getEntityRelationshipFieldNames_flatFields(self):
    """FLAT_FIELD lookup yields only the set non-relationship field."""
    fines = [StateFine.new_with_defaults()]
    people = [StatePerson.new_with_defaults()]
    sentence_group = StateSentenceGroup.new_with_defaults(
        fines=fines,
        person=people,
        sentence_group_id=_ID)

    flat_field_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FLAT_FIELD)

    self.assertEqual({'sentence_group_id'}, flat_field_names)
def _get_all_entity_trees_of_cls_helper(
        tree: EntityTree,
        cls: Type[DatabaseEntity],
        seen_ids: Set[int],
        seen_trees: List[EntityTree],
        direction_checker: SchemaEdgeDirectionChecker):
    """Collects every tree in the |tree| graph whose entity is of type |cls|.

    Each newly found object has its ``id()`` added to |seen_ids| and its
    EntityTree appended to |seen_trees|; nothing is returned.
    """
    node = tree.entity
    node_cls = node.__class__

    # When |cls| outranks the current entity's class, no object of type
    # |cls| can be reached from here, so stop descending.
    if direction_checker.is_higher_ranked(cls, node_cls):
        return

    is_unseen_match = node_cls == cls and id(node) not in seen_ids
    if is_unseen_match:
        seen_ids.add(id(node))
        seen_trees.append(tree)
        return

    forward_edge_names = get_set_entity_field_names(
        node, EntityFieldType.FORWARD_EDGE)
    for edge_name in forward_edge_names:
        children = node.get_field_as_list(edge_name)
        for subtree in tree.generate_child_trees(children):
            _get_all_entity_trees_of_cls_helper(
                subtree, cls, seen_ids, seen_trees, direction_checker)
def _get_match_results_for_all_children(
        ingested_entity_tree: EntityTree,
        db_entity_trees: List[EntityTree],
        root_entity_cls) \
        -> List[Tuple[str, MatchResults]]:
    """Matches each set child field of |ingested_entity_tree| against the
    corresponding children of |db_entity_trees|.

    Every child field is matched on its own, so different children may end
    up matched under different DB parents.

    Returns:
        One ``(child field name, MatchResults)`` tuple per set forward-edge
        field on the ingested entity.
    """
    parent_entity = ingested_entity_tree.entity

    def _match_children_in_field(field_name: str) -> MatchResults:
        """Runs tree matching for the children held in a single field."""
        field_value = get_field(parent_entity, field_name)
        db_children = generate_child_entity_trees(field_name, db_entity_trees)

        # A child field may hold one object or a list; always work on a list.
        as_list = (field_value if isinstance(field_value, list)
                   else [field_value])
        ingested_children = ingested_entity_tree.generate_child_trees(as_list)
        return _match_entity_trees(
            ingested_entity_trees=ingested_children,
            db_entity_trees=db_children,
            root_entity_cls=root_entity_cls)

    child_field_names = get_set_entity_field_names(
        parent_entity, EntityFieldType.FORWARD_EDGE)
    return [(field_name, _match_children_in_field(field_name))
            for field_name in child_field_names]
def test_getEntityRelationshipFieldNames_backedges(self):
    """BACK_EDGE lookup yields only the 'person' field."""
    fines = [StateFine.new_with_defaults()]
    people = [StatePerson.new_with_defaults()]
    sentence_group = StateSentenceGroup.new_with_defaults(
        fines=fines,
        person=people,
        sentence_group_id=_ID)

    back_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.BACK_EDGE)

    self.assertEqual({'person'}, back_edge_names)
def test_getEntityRelationshipFieldNames_all(self):
    """ALL lookup yields every field set on the schema object."""
    sentence_group = schema.StateSentenceGroup(
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID)
    expected_names = {'fines', 'person', 'person_id', 'sentence_group_id'}

    actual_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.ALL)

    self.assertEqual(expected_names, actual_names)
def test_getEntityRelationshipFieldNames_foreignKeys(self):
    """FOREIGN_KEYS lookup yields only the 'person_id' field."""
    sentence_group = schema.StateSentenceGroup(
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID)

    foreign_key_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FOREIGN_KEYS)

    self.assertEqual({'person_id'}, foreign_key_names)
def test_getEntityRelationshipFieldNames_backedges(self):
    """BACK_EDGE lookup on a schema object yields only the 'person' field."""
    sentence_group = schema.StateSentenceGroup(
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID)

    back_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.BACK_EDGE)

    self.assertEqual({'person'}, back_edge_names)
def test_getDbEntityRelationshipFieldNames_children(self):
    """FORWARD_EDGE lookup on a schema object yields only the 'fines' field."""
    sentence_group = schema.StateSentenceGroup(
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID)

    forward_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FORWARD_EDGE)

    self.assertEqual({'fines'}, forward_edge_names)
def merge_incomplete_periods(
    new_entity: schema.StateIncarcerationPeriod,
    old_entity: schema.StateIncarcerationPeriod,
) -> schema.StateIncarcerationPeriod:
    """Folds two incarceration periods carrying admission and release
    information into a single period.

    The status of whichever period holds the release event is treated as the
    most relevant, up-to-date one.

    Args:
        new_entity: The out-of-session period (i.e. new to this ingest run).
        old_entity: The in-session period (i.e. pulled out of the DB).

    Returns:
        |old_entity|, updated in place with the merged values.
    """
    # Identical external ids mean a complete match: do the regular merge.
    if new_entity.external_id == old_entity.external_id:
        default_merge_flat_fields(new_entity=new_entity, old_entity=old_entity)
        return old_entity

    # Work out which external id the merged period should carry.
    new_is_complete = is_incarceration_period_complete(new_entity)
    old_is_complete = is_incarceration_period_complete(old_entity)
    if new_is_complete == old_is_complete:
        # Same completeness on both sides: join the ids, admission side first.
        if new_entity.admission_date:
            admission_side, release_side = new_entity, old_entity
        else:
            admission_side, release_side = old_entity, new_entity
        merged_external_id = (
            admission_side.external_id
            + _INCARCERATION_PERIOD_ID_DELIMITER
            + release_side.external_id
        )
    elif new_is_complete:
        merged_external_id = new_entity.external_id
    else:
        merged_external_id = old_entity.external_id

    # The new period's status wins only when it carries a release date.
    # These are read before the copy below overwrites fields on old_entity.
    status_source = new_entity if new_entity.release_date else old_entity
    merged_status = status_source.status
    merged_status_raw_text = status_source.status_raw_text

    # Copy every set flat field from the new period onto the old one.
    for flat_field_name in get_set_entity_field_names(
        new_entity, EntityFieldType.FLAT_FIELD
    ):
        old_entity.set_field(
            flat_field_name, new_entity.get_field(flat_field_name)
        )

    # The external id and status values are always set explicitly afterwards.
    old_entity.external_id = merged_external_id
    old_entity.status = merged_status
    old_entity.status_raw_text = merged_status_raw_text
    return old_entity
def clear_db_ids(db_entities: Sequence[CoreEntity]):
    """Calls ``clear_id()`` on every entity in the |db_entities| graphs.

    Each given entity is cleared, then the function recurses into the
    objects held in that entity's set forward-edge fields.
    """
    for db_entity in db_entities:
        db_entity.clear_id()
        forward_edge_names = get_set_entity_field_names(
            db_entity, EntityFieldType.FORWARD_EDGE)
        for edge_name in forward_edge_names:
            children = db_entity.get_field_as_list(edge_name)
            clear_db_ids(children)
def _base_entity_match(
        a: DatabaseEntity,
        b: DatabaseEntity,
        skip_fields: Set[str],
        allow_null_mismatch: bool = False
) -> bool:
    """Decides whether two objects of the same type are an entity match.

    Args:
        a: The first entity to match.
        b: The second entity to match.
        skip_fields: Names of fields to leave out of the flat-field
            comparison.
        allow_null_mismatch: When True, a field that is None on one side
            does not disqualify the match.

    Returns:
        True if the entities match, False otherwise.
    """
    # A placeholder on either side rules out a match.
    if is_placeholder(a) or is_placeholder(b):
        return False

    # Once either side carries an external id, the ids alone decide.
    if a.get_external_id() or b.get_external_id():
        return a.get_external_id() == b.get_external_id()

    def _fields_conflict(field_name: str) -> bool:
        """Reports whether the named field disqualifies the match."""
        left = a.get_field(field_name)
        right = b.get_field(field_name)
        if allow_null_mismatch and (left is None or right is None):
            # A null on one side is tolerated in this mode.
            return False
        return bool(left != right)

    names_on_either = (
        get_set_entity_field_names(a, EntityFieldType.FLAT_FIELD)
        | get_set_entity_field_names(b, EntityFieldType.FLAT_FIELD)
    )
    comparable_names = (
        name for name in names_on_either
        # The primary key and explicitly skipped fields are ignored.
        if not (name == a.get_class_id_name() or name in skip_fields)
    )
    return not any(_fields_conflict(name) for name in comparable_names)
def _get_root_entity_helper(entity: Entity) -> Optional[Type]:
    """Finds the class of the first non-placeholder entity at or below
    |entity|.

    A non-placeholder |entity| yields its own class. Otherwise only the
    first object of each set forward-edge field is searched, in turn.

    Returns:
        The class found, or None when the search finds nothing.
    """
    if not is_placeholder(entity):
        return entity.__class__

    forward_edge_names = get_set_entity_field_names(
        entity, EntityFieldType.FORWARD_EDGE)
    # Lazy on purpose: stop at the first field that produces a result.
    results = (
        _get_root_entity_helper(get_field_as_list(entity, edge_name)[0])
        for edge_name in forward_edge_names
    )
    return next((found for found in results if found is not None), None)
def test_getEntityRelationshipFieldNames_backedges(self) -> None:
    """BACK_EDGE lookup on a schema object yields only the 'person' field."""
    sentence_group = schema.StateSentenceGroup(
        state_code="US_XX",
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID,
    )

    back_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.BACK_EDGE
    )

    self.assertEqual({"person"}, back_edge_names)
def test_getDbEntityRelationshipFieldNames_children(self) -> None:
    """FORWARD_EDGE lookup on a schema object yields only the 'fines' field."""
    sentence_group = schema.StateSentenceGroup(
        state_code="US_XX",
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID,
    )

    forward_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FORWARD_EDGE
    )

    self.assertEqual({"fines"}, forward_edge_names)
def _get_all_entities_of_cls_helper(entity: Entity,
                                    cls: Type,
                                    seen_ids: Set[int],
                                    seen_entities: List[Entity]):
    """Gathers instances of |cls| reachable from |entity| via forward edges.

    A not-yet-seen instance has its ``id()`` added to |seen_ids| and is
    appended to |seen_entities|, and the search stops at that object.
    Anything else is searched through its set forward-edge fields.
    """
    is_unseen_match = isinstance(entity, cls) and id(entity) not in seen_ids
    if is_unseen_match:
        seen_ids.add(id(entity))
        seen_entities.append(entity)
        return

    forward_edge_names = get_set_entity_field_names(
        entity, EntityFieldType.FORWARD_EDGE)
    children = (
        child
        for edge_name in forward_edge_names
        for child in get_field_as_list(entity, edge_name)
    )
    for child in children:
        _get_all_entities_of_cls_helper(child, cls, seen_ids, seen_entities)
def test_getEntityRelationshipFieldNames_all(self) -> None:
    """ALL lookup yields every field set on the schema object."""
    sentence_group = schema.StateSentenceGroup(
        state_code="US_XX",
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID,
    )
    expected_names = {
        "state_code",
        "fines",
        "person",
        "person_id",
        "sentence_group_id",
    }

    actual_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.ALL
    )

    self.assertEqual(expected_names, actual_names)
def test_getEntityRelationshipFieldNames_foreignKeys(self) -> None:
    """FOREIGN_KEYS lookup yields only the 'person_id' field."""
    sentence_group = schema.StateSentenceGroup(
        state_code="US_XX",
        fines=[schema.StateFine()],
        person=schema.StatePerson(),
        person_id=_ID,
        sentence_group_id=_ID,
    )

    foreign_key_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FOREIGN_KEYS
    )

    self.assertEqual({"person_id"}, foreign_key_names)
def _get_all_entities_of_type_helper(root: Entity,
                                     cls: Type,
                                     seen: Set[int],
                                     entities_of_type: List[Entity]):
    """Gathers instances of |cls| reachable from |root| via forward edges.

    Args:
        root: The object to start searching from.
        cls: The type to collect.
        seen: ``id()`` values of objects already collected; updated in place
            so an object reachable along several paths is collected once.
        entities_of_type: Output list; each newly found instance is appended.

    The search does not descend below an instance of |cls|.
    """
    if isinstance(root, cls):
        root_id = id(root)
        if root_id not in seen:
            # Record the object so a later visit is recognised as a repeat.
            # Without this the membership check above could never succeed.
            seen.add(root_id)
            entities_of_type.append(root)
        return

    for field_name in get_set_entity_field_names(
            root, EntityFieldType.FORWARD_EDGE):
        for field in get_field_as_list(root, field_name):
            _get_all_entities_of_type_helper(
                field, cls, seen, entities_of_type)
def default_merge_flat_fields(
        *, new_entity: DatabaseEntity, old_entity: DatabaseEntity
) -> DatabaseEntity:
    """Copies the set non-relationship fields of |new_entity| onto
    |old_entity| and returns |old_entity|.

    The primary key is never copied, and 'status' is left alone when
    |new_entity| reports a default status.
    """
    flat_field_names = get_set_entity_field_names(
        new_entity, EntityFieldType.FLAT_FIELD)
    for flat_field_name in flat_field_names:
        should_skip = (
            flat_field_name == old_entity.get_class_id_name()
            # A default status must not overwrite the existing one.
            or (flat_field_name == 'status'
                and new_entity.has_default_status())
        )
        if should_skip:
            continue
        incoming_value = new_entity.get_field(flat_field_name)
        old_entity.set_field(flat_field_name, incoming_value)

    return old_entity
def _base_entity_match(a: DatabaseEntity, b: DatabaseEntity) -> bool:
    """Decides whether |a| and |b| are an entity match.

    Placeholders never match. If either entity has an external id, the ids
    decide. Otherwise every flat field set on either entity, other than the
    primary key, must be equal.
    """
    if is_placeholder(a) or is_placeholder(b):
        return False

    # Once either side carries an external id, the ids alone decide.
    if a.get_external_id() or b.get_external_id():
        return a.get_external_id() == b.get_external_id()

    names_on_either = (
        get_set_entity_field_names(a, EntityFieldType.FLAT_FIELD)
        | get_set_entity_field_names(b, EntityFieldType.FLAT_FIELD)
    )
    comparable_names = (
        name for name in names_on_either
        # The primary key is never part of the comparison.
        if name != a.get_class_id_name()
    )
    return not any(
        a.get_field(name) != b.get_field(name)
        for name in comparable_names
    )
def _populate_multiparent_map(
        entity: Entity,
        entity_cls: Type,
        multiparent_map: Dict[str, List[_EntityWithParents]]):
    """Walks the children of |entity| and records each child of type
    |entity_cls| in |multiparent_map|, keyed by the child's external id.

    Every map value is a list of _EntityWithParents, one per distinct child
    object seen with that external id, each holding its linked parents.
    """
    forward_edge_names = get_set_entity_field_names(
        entity, EntityFieldType.FORWARD_EDGE)
    for edge_name in forward_edge_names:
        parent_link = _LinkedParents(entity, edge_name)
        for child in get_field_as_list(entity, edge_name):
            # Descend first, whatever the type of this child.
            _populate_multiparent_map(child, entity_cls, multiparent_map)

            if not isinstance(child, entity_cls):
                continue

            # All persistence entities are ExternalIdEntities
            child = cast(ExternalIdEntity, child)
            child_external_id = child.external_id

            # Only children that have an external_id are recorded.
            if not child_external_id:
                continue

            # First time this external_id shows up: start a new entry.
            if child_external_id not in multiparent_map:
                multiparent_map[child_external_id] = [
                    _EntityWithParents(child, [parent_link])]
                continue

            known_entries = multiparent_map[child_external_id]
            same_object_entries = [
                entry for entry in known_entries if entry.entity is child]

            # This exact object is already recorded: just add |entity| as
            # another linked parent.
            for entry in same_object_entries:
                entry.linked_parents.append(parent_link)

            # A different object with the same external_id gets its own
            # _EntityWithParents.
            if not same_object_entries:
                known_entries.append(
                    _EntityWithParents(child, [parent_link]))
def test_getEntityRelationshipFieldNames_children(self) -> None:
    """FORWARD_EDGE lookup yields only the 'fines' field."""
    fine = StateFine.new_with_defaults(
        state_code="US_XX", status=StateFineStatus.PRESENT_WITHOUT_INFO
    )
    person = StatePerson.new_with_defaults(state_code="US_XX")
    sentence_group = StateSentenceGroup.new_with_defaults(
        state_code="US_XX",
        status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
        fines=[fine],
        person=[person],
        sentence_group_id=_ID,
    )

    forward_edge_names = get_set_entity_field_names(
        sentence_group, EntityFieldType.FORWARD_EDGE
    )

    self.assertEqual({"fines"}, forward_edge_names)
def _default_merge_flat_fields(*, new_entity: Entity, old_entity: Entity) \
        -> Entity:
    """Copies the set non-relationship fields of |new_entity| onto
    |old_entity| and returns |old_entity|.

    'status' is left alone when |new_entity| has a default status.
    """
    flat_field_names = get_set_entity_field_names(
        new_entity, EntityFieldType.FLAT_FIELD)
    for flat_field_name in flat_field_names:
        is_default_status = (flat_field_name == 'status'
                             and has_default_status(new_entity))
        if not is_default_status:
            incoming_value = get_field(new_entity, flat_field_name)
            set_field(old_entity, flat_field_name, incoming_value)

    return old_entity
def convert_to_placeholder(entity: DatabaseEntity):
    """Resets the set flat fields of |entity| in place.

    The primary key and 'state_code' are left untouched. 'status',
    'incarceration_type', 'court_type' and 'agent_type' are overwritten with
    the fixed values below. Every other set flat field is cleared.
    """
    fixed_values = {
        'status': enum_canonical_strings.present_without_info,
        'incarceration_type': StateIncarcerationType.STATE_PRISON.value,
        'court_type': StateCourtType.PRESENT_WITHOUT_INFO.value,
        'agent_type': StateAgentType.PRESENT_WITHOUT_INFO.value,
    }

    flat_field_names = get_set_entity_field_names(
        entity, EntityFieldType.FLAT_FIELD)
    for flat_field_name in flat_field_names:
        is_preserved = (flat_field_name == entity.get_class_id_name()
                        or flat_field_name == 'state_code')
        if is_preserved:
            continue

        if flat_field_name in fixed_values:
            entity.set_field(flat_field_name, fixed_values[flat_field_name])
        else:
            entity.clear_field(flat_field_name)
def _match_placeholder_tree(
        *,
        ingested_placeholder_tree: EntityTree,
        db_entity_trees: List[EntityTree],
        matched_entities_by_db_ids: Dict[int, Entity],
        root_entity_cls) \
        -> IndividualMatchResult:
    """Tries to match |ingested_placeholder_tree| to entities in
    |db_entity_trees| by way of its children's matches.

    A matched child is moved off the ingested placeholder and onto the DB
    entity it matched under; an unmatched child stays on the placeholder.

    Returns:
        An IndividualMatchResult holding the ingested placeholder tree, the
        trees that were updated, and the summed error count of the child
        matches.
    """
    placeholder_entity = ingested_placeholder_tree.entity
    updated_parent_trees: List[EntityTree] = []
    total_errors = 0

    results_by_child_field = _get_match_results_for_all_children(
        ingested_entity_tree=ingested_placeholder_tree,
        db_entity_trees=db_entity_trees,
        root_entity_cls=root_entity_cls)

    def _resolve_child_match_result(child_field_name,
                                    child_match_result,
                                    children_to_keep: List[Entity]) -> None:
        """Settles one child match: a matched child is added onto its DB
        parent, and a child that stays with the placeholder is appended to
        |children_to_keep|.
        """
        if not child_field_name or not child_match_result:
            raise EntityMatchingError(
                f"Expected child_field_name and child_match_result to be set, "
                f"but instead got {child_field_name} and {child_match_result} "
                f"respectively.",
                placeholder_entity.get_entity_name())

        ingested_child = child_match_result.ingested_entity_tree.entity

        # An unmatched child stays on the placeholder object.
        if not child_match_result.merged_entity_trees:
            children_to_keep.append(ingested_child)
            return

        # Put each merged child on the correct parent entity.
        for merged_child_tree in child_match_result.merged_entity_trees:
            parent_tree = merged_child_tree.generate_parent_tree()
            parent_entity = parent_tree.entity

            # The merged parent is the ingested placeholder itself: keep the
            # child with the placeholder.
            if parent_entity == placeholder_entity:
                children_to_keep.append(ingested_child)
                continue

            add_child_to_entity(entity=parent_entity,
                                child_field_name=child_field_name,
                                child_to_add=merged_child_tree.entity)

            # Track each DB parent of a merged child only once.
            tracked_entities = [t.entity for t in updated_parent_trees]
            if parent_entity not in tracked_entities:
                _add_match_to_matched_entities_cache(
                    db_entity_match=parent_entity,
                    ingested_entity=placeholder_entity,
                    matched_entities_by_db_ids=matched_entities_by_db_ids)
                updated_parent_trees.append(parent_tree)

    for field_name, field_match_results in results_by_child_field:
        remaining_children: List[Entity] = []
        total_errors += field_match_results.error_count
        for single_result in field_match_results.individual_match_results:
            _resolve_child_match_result(
                field_name, single_result, remaining_children)
        # Only the children that were kept remain on the placeholder.
        set_field_from_list(
            placeholder_entity, field_name, remaining_children)

    # If any entity tree was updated and the placeholder still has children,
    # include the placeholder tree in the result. A placeholder left without
    # children doesn't need to be committed into our DB.
    if updated_parent_trees and get_set_entity_field_names(
            placeholder_entity,
            entity_field_type=EntityFieldType.FORWARD_EDGE):
        updated_parent_trees.append(ingested_placeholder_tree)

    return IndividualMatchResult(
        ingested_entity_tree=ingested_placeholder_tree,
        merged_entity_trees=updated_parent_trees,
        error_count=total_errors)