def prune_dangling_placeholders_from_tree(
        entity: CoreEntity) -> Optional[CoreEntity]:
    """Strips placeholder-only subtrees out of the tree rooted at |entity|.

    Every set forward-edge field of |entity| is rewritten in place so that it
    only holds the children whose own (recursively pruned) subtree survived.

    Returns |entity| itself when at least one child survived or when |entity|
    is not a placeholder; returns None when the whole tree is a dangling
    placeholder.
    """
    kept_any_child = False
    forward_edge_names = get_set_entity_field_names(
        entity, EntityFieldType.FORWARD_EDGE)

    for edge_name in forward_edge_names:
        pruned_children = (
            prune_dangling_placeholders_from_tree(child)
            for child in entity.get_field_as_list(edge_name))
        survivors = [subtree for subtree in pruned_children if subtree]
        if survivors:
            kept_any_child = True
        # Written back even when nothing survived, which empties the field.
        entity.set_field_from_list(edge_name, survivors)

    if not kept_any_child and is_placeholder(entity):
        return None
    return entity
def __init__(self,
             database_entity: CoreEntity,
             ingested_entities: Sequence[CoreEntity]):
    """Builds the error for one database entity matching many ingested ones.

    Args:
        database_entity: The database entity that was matched more than once;
            its id and entity name are reported.
        ingested_entities: The ingested entities that all matched
            |database_entity|; each is rendered with str() on its own line.
    """
    msg_template = (
        "Matched one database entity to multiple ingested entities."
        "\nDatabase entity db id: {}"
        "\nIngested entities: {}")
    db_id = database_entity.get_id()
    ingested_listing = '\n'.join(map(str, ingested_entities))
    message = msg_template.format(db_id, ingested_listing)
    entity_name = database_entity.get_entity_name()
    super().__init__(message, entity_name)
def is_placeholder(entity: CoreEntity) -> bool:
    """Returns True if |entity| carries no real information of its own.

    A placeholder is an object whose existence we inferred from other objects
    but about which we know nothing. In practice, an entity counts as a
    placeholder when, after ignoring its primary key, its state code and any
    fields still holding a known default value, no flat field remains set.
    """
    # External ids, races, aliases and ethnicities are not flat fields, but
    # they do describe the person. If any is present we have information
    # about this person, so it cannot be a placeholder.
    if isinstance(entity, (schema.StatePerson, entities.StatePerson)):
        person_characteristics = [
            entity.external_ids,
            entity.races,
            entity.aliases,
            entity.ethnicities,
        ]
        if any(person_characteristics):
            return False

    informative_fields = get_set_entity_field_names(
        entity, EntityFieldType.FLAT_FIELD)

    # The primary key says nothing about the entity itself.
    informative_fields.discard(entity.get_primary_key_column_name())

    # TODO(2244): Change this to a general approach so we don't need to check
    # explicit columns
    informative_fields.discard('state_code')

    if 'status' in informative_fields and entity.has_default_status():
        informative_fields.discard('status')

    # Enum fields that do not count as information while they still hold the
    # listed default value.
    enum_defaults = (
        ('incarceration_type', StateIncarcerationType.STATE_PRISON),
        ('court_type', StateCourtType.PRESENT_WITHOUT_INFO),
        ('agent_type', StateAgentType.PRESENT_WITHOUT_INFO),
    )
    for enum_field, default_value in enum_defaults:
        if enum_field in informative_fields \
                and entity.has_default_enum(enum_field, default_value):
            informative_fields.discard(enum_field)

    return not informative_fields
def __init__(self,
             ingested_entity: CoreEntity,
             database_entities: Sequence[CoreEntity]):
    """Builds the error for one ingested entity matching many database ones.

    Args:
        ingested_entity: The ingested entity that was matched more than once;
            it is rendered with str() and its entity name is reported.
        database_entities: The database entities that all matched
            |ingested_entity|; only their ids are listed, one per line.
    """
    msg_template = (
        "Matched one ingested entity to multiple database entities."
        "\nIngested entity: {}"
        "\nDatabase entity db ids: {}")
    msg = msg_template.format(
        ingested_entity,
        '\n'.join(str(e.get_id()) for e in database_entities))
    # Zero-argument super() keeps this initializer consistent with its
    # sibling error class and does not break if the class is renamed.
    super().__init__(msg, ingested_entity.get_entity_name())
def get_set_entity_field_names(entity: CoreEntity,
                               entity_field_type: EntityFieldType) -> Set[str]:
    """Returns the names of the |entity_field_type| fields set on |entity|.

    A field counts as set when its value is a non-empty list, or any other
    value that is not None.
    """
    set_field_names = set()
    candidate_names = get_all_core_entity_field_names(entity,
                                                      entity_field_type)
    for name in candidate_names:
        value = entity.get_field(name)
        if value is None:
            continue
        if isinstance(value, list) and not value:
            # An empty list is treated the same as an unset field.
            continue
        set_field_names.add(name)
    return set_field_names
def get_all_core_entity_field_names(
        entity: CoreEntity,
        entity_field_type: EntityFieldType) -> Set[str]:
    """Returns the field names on |entity| of the given |entity_field_type|.

    The edge direction checker is chosen from the entity name: names starting
    with 'state_' use the state checker, everything else the county checker.
    The actual lookup is delegated to a helper that depends on whether
    |entity| is a DatabaseEntity or an Entity.

    NOTE(review): judging by this function's name and by callers that filter
    or print None values from its result, the returned names are presumably
    not limited to fields that currently hold a value - confirm against the
    delegated helpers.

    Raises:
        ValueError: If |entity| is neither a DatabaseEntity nor an Entity.
    """
    is_state_entity = entity.get_entity_name().startswith('state_')
    make_direction_checker = (
        SchemaEdgeDirectionChecker.state_direction_checker
        if is_state_entity
        else SchemaEdgeDirectionChecker.county_direction_checker)
    direction_checker = make_direction_checker()

    if isinstance(entity, DatabaseEntity):
        collect_field_names = _get_all_database_entity_field_names
    elif isinstance(entity, Entity):
        collect_field_names = _get_all_entity_field_names
    else:
        raise ValueError(f"Invalid entity type [{type(entity)}]")

    return collect_field_names(entity, entity_field_type, direction_checker)
def print_entity_tree(entity: CoreEntity,
                      print_tree_structure_only: bool = False,
                      indent: int = 0,
                      python_id_to_fake_id: Optional[Dict[int, int]] = None):
    """Recursively prints out all objects in the tree below the given entity.

    Each time we encounter a new object, we assign a new fake id (an
    auto-incrementing count) and print that with the object. This means that
    two entity trees with the exact same shape/flat fields will print out the
    exact same string, making it much easier to debug edge-related issues in
    Diffchecker, etc.

    Field names are visited in sorted order so that both the printed order and
    the order in which objects are first encountered do not depend on set
    iteration order, which can differ from one process to the next.

    Note: this function sorts any list fields in the provided entity IN PLACE
    (should not matter for any equality checks we generally do).

    Args:
        entity: Root of the (sub)tree to print.
        print_tree_structure_only: When True, the only flat field printed is
            'external_id'; edges are always printed.
        indent: Indentation passed to the print helper for this entity's
            header line; fields are printed two further in, and child
            entities two further again.
        python_id_to_fake_id: Mapping shared across the whole recursion and
            handed to the id-string helper. Callers normally leave this
            unset; a fresh dict is created at the root call.
    """
    if python_id_to_fake_id is None:
        python_id_to_fake_id = {}

    _sort_based_on_flat_fields([entity])

    _print_indented(_obj_id_str(entity, python_id_to_fake_id), indent)

    indent = indent + 2

    flat_fields = get_all_core_entity_field_names(
        entity, EntityFieldType.FLAT_FIELD)
    for field in sorted(flat_fields):
        if field == 'external_id' or not print_tree_structure_only:
            val = entity.get_field(field)
            _print_indented(f'{field}: {str(val)}', indent)

    forward_edges = get_all_core_entity_field_names(
        entity, EntityFieldType.FORWARD_EDGE)
    for child_field in sorted(forward_edges):
        child = entity.get_field(child_field)

        if child is None:
            _print_indented(f'{child_field}: None', indent)
        elif not isinstance(child, list):
            _print_indented(f'{child_field}:', indent)
            print_entity_tree(child,
                              print_tree_structure_only,
                              indent + 2,
                              python_id_to_fake_id)
        elif not child:
            _print_indented(f'{child_field}: []', indent)
        else:
            _print_indented(f'{child_field}: [', indent)
            for c in child:
                print_entity_tree(c,
                                  print_tree_structure_only,
                                  indent + 2,
                                  python_id_to_fake_id)
            _print_indented(']', indent)

    # Back edges are summarized rather than recursed into.
    back_edges = get_all_core_entity_field_names(
        entity, EntityFieldType.BACK_EDGE)
    for child_field in sorted(back_edges):
        child = entity.get_field(child_field)

        if not child:
            _print_indented(f'{child_field}: None - backedge', indent)
        elif isinstance(child, list):
            first_child = next(iter(child))
            # Flag lists that hold the same Python object more than once.
            unique = {id(c) for c in child}
            if len(unique) == len(child):
                len_str = f'{len(child)}'
            else:
                len_str = f'{len(child)} - ONLY {len(unique)} UNIQUE!'

            id_str = _obj_id_str(first_child, python_id_to_fake_id)
            ellipsis_str = ', ...' if len(child) > 1 else ''

            _print_indented(
                f'{child_field} ({len_str}): [{id_str}{ellipsis_str}] - '
                f'backedge',
                indent)
        else:
            id_str = _obj_id_str(child, python_id_to_fake_id)
            _print_indented(f'{child_field}: {id_str} - backedge', indent)
def _get_flat_fields_json_str(entity: CoreEntity):
    """Returns a JSON object string of the set flat fields on |entity|.

    Each set flat field name maps to the str() of its value, and keys are
    emitted in sorted order.
    """
    set_flat_field_names = get_set_entity_field_names(
        entity, EntityFieldType.FLAT_FIELD)
    stringified_fields = {
        name: str(entity.get_field(name)) for name in set_flat_field_names
    }
    return json.dumps(stringified_fields, sort_keys=True)