def test_readPeople(self) -> None:
    """Read with no name or birthdate filter and expect every stored person."""
    # Arrange
    named_with_birthdate = schema.StatePerson(
        person_id=8,
        full_name=_FULL_NAME,
        birthdate=_BIRTHDATE,
        state_code=_STATE_CODE,
    )
    other_name = schema.StatePerson(
        person_id=9, full_name="diff_name", state_code=_STATE_CODE
    )
    other_birthdate = schema.StatePerson(
        state_code=_STATE_CODE,
        person_id=10,
        birthdate=datetime.date(2002, 1, 2),
    )
    stored_people = [named_with_birthdate, other_name, other_birthdate]

    session = SessionFactory.for_schema_base(StateBase)
    for stored in stored_people:
        session.add(stored)
    session.commit()

    # Act
    people = dao.read_people(session, full_name=None, birthdate=None)

    # Assert: all three rows come back, in any order.
    self.assertCountEqual(people, stored_people)
def test_readPeople_byBirthdate(self) -> None:
    """Filter by birthdate=_BIRTHDATE and expect only the matching person."""
    # Arrange
    matching = schema.StatePerson(
        person_id=8, birthdate=_BIRTHDATE, state_code=_STATE_CODE
    )
    non_matching = schema.StatePerson(
        state_code=_STATE_CODE,
        person_id=9,
        birthdate=datetime.date(2002, 1, 2),
    )

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        for stored in (matching, non_matching):
            session.add(stored)
        session.commit()

        # Act
        people = dao.read_people(session, full_name=None, birthdate=_BIRTHDATE)

        # Assert (performed while the session is still open)
        self.assertCountEqual(people, [matching])
def test_readPeople(self):
    """Read with no filters and compare against the stored people as entities."""
    # Arrange
    named_with_birthdate = schema.StatePerson(
        person_id=8, full_name=_FULL_NAME, birthdate=_BIRTHDATE
    )
    other_name = schema.StatePerson(person_id=9, full_name="diff_name")
    other_birthdate = schema.StatePerson(
        person_id=10, birthdate=datetime.date(2002, 1, 2)
    )
    stored_people = [named_with_birthdate, other_name, other_birthdate]

    session = SessionFactory.for_schema_base(StateBase)
    for stored in stored_people:
        session.add(stored)
    session.commit()

    # Act
    people = dao.read_people(session, full_name=None, birthdate=None)

    # Assert: the expected values are the stored schema objects run through
    # the schema -> entity converter (done after the read, as before).
    expected_people = [
        converter.convert_schema_object_to_entity(stored)
        for stored in stored_people
    ]
    self.assertCountEqual(people, expected_people)
def read_persons(
    session: Session,
    region: str,
    ingested_people: List[schema.StatePerson],
) -> List[schema.StatePerson]:
    """Looks up all people necessary for entity matching based on the provided
    |region| and |ingested_people|.

    |ingested_people| is passed through check_all_objs_have_type against
    schema.StatePerson. The read itself is currently not narrowed: it calls
    dao.read_people with only the session, and |region| is used solely in the
    log message.
    """
    check_all_objs_have_type(ingested_people, schema.StatePerson)

    # TODO(1868): more specific query
    people_in_db = dao.read_people(session)

    people_count = len(people_in_db)
    logging.info("Read [%d] people from DB in region [%s]", people_count, region)
    return people_in_db
def test_readPeople_byBirthdate(self):
    """Filter by birthdate=_BIRTHDATE and expect only the matching person."""
    # Arrange
    matching = schema.StatePerson(person_id=8, birthdate=_BIRTHDATE)
    non_matching = schema.StatePerson(
        person_id=9, birthdate=datetime.date(2002, 1, 2)
    )

    session = SessionFactory.for_schema_base(StateBase)
    for stored in (matching, non_matching):
        session.add(stored)
    session.commit()

    # Act
    people = dao.read_people(session, full_name=None, birthdate=_BIRTHDATE)

    # Assert
    self.assertCountEqual(people, [matching])
def test_readPeople_byFullName(self):
    """Filter by full_name=_FULL_NAME and expect only the matching person."""
    # Arrange
    matching = schema.StatePerson(person_id=8, full_name=_FULL_NAME)
    non_matching = schema.StatePerson(person_id=9, full_name="diff_name")

    session = SessionFactory.for_schema_base(StateBase)
    for stored in (matching, non_matching):
        session.add(stored)
    session.commit()

    # Act
    people = dao.read_people(session, full_name=_FULL_NAME, birthdate=None)

    # Assert
    self.assertCountEqual(people, [matching])
def assert_people_match_pre_and_post_commit(
    self,
    expected_people,
    matched_people,
    match_session,
    expected_unmatched_db_people=None,
    debug=False,
):
    """Checks |matched_people| against |expected_people| before and after commit.

    First compares the two collections directly (forwarding |debug|). Then
    commits and closes |match_session|, re-reads all people through a fresh
    session from self._session(), and compares again.

    Note: when |expected_unmatched_db_people| is non-empty it is appended to
    the caller's |expected_people| list in place before the second comparison.
    """
    # Pre-commit comparison.
    self._assert_people_match(expected_people, matched_people, debug)

    # Sanity check that committing and reading the people from the DB
    # doesn't break/update any fields (except for DB ids).
    match_session.commit()
    match_session.close()

    verification_session = self._session()
    people_after_commit = dao.read_people(verification_session)

    if expected_unmatched_db_people:
        expected_people.extend(expected_unmatched_db_people)

    # Post-commit comparison (no debug flag is forwarded here).
    self._assert_people_match(expected_people, people_after_commit)
    assert_no_unexpected_entities_in_db(people_after_commit, verification_session)
def read_persons(
    session: Session,
    region: str,
    ingested_people: List[StatePerson],
) -> List[StatePerson]:
    """Looks up all people necessary for entity matching based on the provided
    |region| and |ingested_people|.

    For region 'US_ND' (compared case-insensitively) the lookup is delegated
    to _nd_read_people; every other region reads via dao.read_people with
    back edges disabled.
    """
    uses_nd_lookup = region.upper() == "US_ND"

    if not uses_nd_lookup:
        # TODO(1868): more specific query
        # Do not populate back edges before entity matching. All entities in
        # the state schema have edges both to their children and their
        # parents. We remove these for simplicity as entity matching does not
        # depend on these parent references (back edges). Back edges are
        # regenerated as a part of the conversion process from Entity ->
        # Schema object. If we did not remove these back edges, any time an
        # entity relationship changes, we would have to update edges both on
        # the parent and child, instead of just on the parent.
        people_in_db = dao.read_people(session, populate_back_edges=False)
    else:
        people_in_db = _nd_read_people(session, region, ingested_people)

    people_count = len(people_in_db)
    logging.info("Read [%d] people from DB in region [%s]", people_count, region)
    return people_in_db
def test_readPeople_byFullName(self) -> None:
    """Filter by full_name=_FULL_NAME and expect only the matching person."""
    # Arrange
    matching = schema.StatePerson(
        person_id=8, full_name=_FULL_NAME, state_code=_STATE_CODE
    )
    non_matching = schema.StatePerson(
        person_id=9, full_name="diff_name", state_code=_STATE_CODE
    )

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        for stored in (matching, non_matching):
            session.add(stored)
        session.commit()

        # Act
        people = dao.read_people(session, full_name=_FULL_NAME, birthdate=None)

        # Assert (performed while the session is still open)
        self.assertCountEqual(people, [matching])
def assert_expected_db_people(
    self,
    expected_db_people: List[StatePerson],
    debug: bool = False,
    single_person_to_debug: Optional[str] = None,
    # TODO(#2492): Once we properly clean up dangling placeholders,
    # delete this.
    ignore_dangling_placeholders: bool = False,
    print_tree_structure_only: bool = False,
) -> None:
    """Asserts that the set of expected people matches all the people that
    currently exist in the database.

    Args:
        expected_db_people: The people expected to be found in the database.
        debug: (bool) If true, prints out both the found and expected entity
            trees. The assertion fails outright if this is set while running
            in CI, so it is for local debugging only.
        single_person_to_debug: (str) A string external_id of a person. If
            debug=True and this is not None, only the people with that
            external_id are printed and compared.
        ignore_dangling_placeholders: (bool) If True, eliminates dangling
            placeholder objects (i.e. placeholders with no non-placeholder
            children) from both the result and expected trees before doing a
            comparison.
        print_tree_structure_only: (bool) If True and debug=True, then the
            printed result only shows the tree structure - external ids and
            parent-child relationships.
    """
    if debug:
        print("\n\n************** ASSERTING *************")

    session = SessionFactory.for_schema_base(StateBase)
    found_people_from_db = dao.read_people(session)
    found_people = cast(
        List[StatePerson], self.convert_and_clear_db_ids(found_people_from_db)
    )

    if ignore_dangling_placeholders:
        # Prune every found tree, dropping trees that prune away entirely.
        pruned_found = (
            cast(StatePerson, prune_dangling_placeholders_from_tree(candidate))
            for candidate in found_people
        )
        found_people = [kept for kept in pruned_found if kept is not None]

        # Same treatment for the expected trees.
        pruned_expected = (
            cast(StatePerson, prune_dangling_placeholders_from_tree(candidate))
            for candidate in expected_db_people
        )
        expected_db_people = [kept for kept in pruned_expected if kept is not None]

    if debug:
        if is_running_in_ci():
            self.fail("The |debug| flag should only be used for local debugging.")

        if single_person_to_debug is not None:
            # Narrow both sides down to the one person being debugged.
            found_people = [
                candidate
                for candidate in found_people
                if person_has_id(candidate, single_person_to_debug)
            ]
            expected_db_people = [
                candidate
                for candidate in expected_db_people
                if person_has_id(candidate, single_person_to_debug)
            ]

        labelled_trees = (
            ("FINAL", found_people),
            ("EXPECTED", expected_db_people),
        )
        for label, trees in labelled_trees:
            print_visible_header_label(label)
            print_entity_trees(
                trees, print_tree_structure_only=print_tree_structure_only
            )

    self.assertCountEqual(found_people, expected_db_people)
    assert_no_unexpected_entities_in_db(found_people_from_db, session)
def test_state_threeSentenceGroups_dontPersistAboveThreshold(self):
    """Expect the DB people to be unchanged after the write.

    Two people are seeded, both owning a sentence group with external id
    SENTENCE_GROUP_ID_2. After persistence.write the people read back must
    equal the entities captured before the write.
    """
    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
    )
    for ingested_group_id in (SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2):
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=ingested_group_id, county_code=COUNTY_CODE
        )

    unknown_status = StateSentenceStatus.EXTERNAL_UNKNOWN.value

    # First DB person: sentence groups 1 and 2.
    db_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
    db_person.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=unknown_status,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
        ),
        schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=unknown_status,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        ),
    ]
    db_person.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
    ]

    # Second DB person: a separate row reusing external id SENTENCE_GROUP_ID_2.
    db_person_2 = schema.StatePerson(person_id=ID_2, full_name=FULL_NAME_1)
    db_person_2.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=unknown_status,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        )
    ]
    db_person_2.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
    ]

    # No updates
    expected_people = [self.to_entity(db_person), self.to_entity(db_person_2)]

    session = SessionFactory.for_schema_base(StateBase)
    for seeded in (db_person, db_person_2):
        session.add(seeded)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        expected_people, converter.convert_schema_objects_to_entity(persons)
    )
def test_state_threeSentenceGroups_persistsTwoBelowThreshold(self):
    """Expect two of the three ingested sentence groups to be updated.

    SENTENCE_GROUP_ID_3 exists under two different DB people; the expected
    result leaves both of those rows without a county code, while groups 1
    and 2 pick up COUNTY_CODE from the ingest.
    """
    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[
            SENTENCE_GROUP_ID,
            SENTENCE_GROUP_ID_2,
            SENTENCE_GROUP_ID_3,
        ],
    )
    for ingested_group_id in (
        SENTENCE_GROUP_ID,
        SENTENCE_GROUP_ID_2,
        SENTENCE_GROUP_ID_3,
    ):
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=ingested_group_id, county_code=COUNTY_CODE
        )

    unknown_status_value = StateSentenceStatus.EXTERNAL_UNKNOWN.value

    # First DB person: sentence groups 1, 2 and 3.
    db_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
    db_person.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
        ),
        schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        ),
        schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
        ),
    ]
    db_person.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
    ]

    # Second DB person: a separate row reusing external id SENTENCE_GROUP_ID_3.
    db_person_2 = schema.StatePerson(person_id=ID_2, full_name=FULL_NAME_1)
    db_person_2.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID_4,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
        )
    ]
    db_person_2.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
    ]

    # Expected entity tree for the first person.
    expected_person = StatePerson.new_with_defaults(
        person_id=ID, full_name=FULL_NAME_1, external_ids=[], sentence_groups=[]
    )
    expected_person.external_ids = [
        StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
            person=expected_person,
        )
    ]
    expected_person.sentence_groups = [
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
            county_code=COUNTY_CODE,
            person=expected_person,
        ),
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
            county_code=COUNTY_CODE,
            person=expected_person,
        ),
        # No county code because errors during match
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
            person=expected_person,
        ),
    ]

    # Expected entity tree for the second person.
    expected_person_2 = StatePerson.new_with_defaults(
        person_id=ID_2, full_name=FULL_NAME_1
    )
    expected_person_2.sentence_groups = [
        # No county code because unmatched
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_4,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
            person=expected_person_2,
        )
    ]
    expected_person_2.external_ids = [
        StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
            person=expected_person_2,
        )
    ]

    session = SessionFactory.for_schema_base(StateBase)
    for seeded in (db_person, db_person_2):
        session.add(seeded)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        [expected_person, expected_person_2],
        converter.convert_schema_objects_to_entity(persons),
    )
def test_state_threeSentenceGroups_persistsTwoBelowThreshold(
    self, mock_get_matcher
):
    """Ensure the write succeeds when one of three sentence groups errors.

    The patched matcher errors on SENTENCE_GROUP_ID only; the expected result
    has that group without a county code while the other two groups pick up
    COUNTY_CODE from the ingest.
    """
    mock_get_matcher.return_value = _PatchedStateEntityMatcher(
        region_code=STATE_CODE,
        erroring_class=schema.StateSentenceGroup,
        erroring_external_ids=[SENTENCE_GROUP_ID],
    )

    # Set the ENTITY_MATCHING_THRESHOLD to 0, such that we can verify that the
    # forty percent threshold for ENTITY_MATCHING_THRESHOLD is dictated by the
    # state-specific override in
    # STATE_CODE_TO_ENTITY_MATCHING_THRESHOLD_FORTY_PERCENT.
    STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS[ENTITY_MATCHING_THRESHOLD] = 0

    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
    )
    ingest_info.state_people.add(
        state_person_id="2_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID_3],
    )
    for ingested_group_id in (
        SENTENCE_GROUP_ID,
        SENTENCE_GROUP_ID_2,
        SENTENCE_GROUP_ID_3,
    ):
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=ingested_group_id, county_code=COUNTY_CODE
        )

    unknown_status_value = StateSentenceStatus.EXTERNAL_UNKNOWN.value

    # First DB person: sentence groups 1 and 2.
    db_person = schema.StatePerson(
        person_id=ID, full_name=FULL_NAME_1, state_code=STATE_CODE
    )
    db_person.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID,
            state_code=STATE_CODE,
        ),
        schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=STATE_CODE,
        ),
    ]
    db_person.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
    ]

    # Second DB person: sentence group 3.
    db_person_2 = schema.StatePerson(
        person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE
    )
    db_person_2.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
    ]
    db_person_2.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=STATE_CODE,
        )
    ]

    session = SessionFactory.for_schema_base(StateBase)
    for seeded in (db_person, db_person_2):
        session.add(seeded)
    session.commit()

    # Expected entity tree for the first person.
    expected_person = StatePerson.new_with_defaults(
        person_id=ID,
        full_name=FULL_NAME_1,
        external_ids=[],
        sentence_groups=[],
        state_code=STATE_CODE,
    )
    expected_person.external_ids = [
        StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
            person=expected_person,
        )
    ]
    expected_person.sentence_groups = [
        # No county code because errors during match
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID,
            state_code=STATE_CODE,
            person=expected_person,
        ),
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=STATE_CODE,
            county_code=COUNTY_CODE,
            person=expected_person,
        ),
    ]

    # Expected entity tree for the second person.
    expected_person_2 = StatePerson.new_with_defaults(
        person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE
    )
    expected_person_2.sentence_groups = [
        StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=STATE_CODE,
            county_code=COUNTY_CODE,
            person=expected_person_2,
        )
    ]
    expected_person_2.external_ids = [
        StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID_2,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
            person=expected_person_2,
        )
    ]

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        [expected_person, expected_person_2],
        converter.convert_schema_objects_to_entity(persons),
    )
def test_state_threeSentenceGroups_dontPersistAboveThreshold(
    self, mock_get_matcher
):
    """Expect the DB people to be unchanged after the write.

    The patched matcher errors on SENTENCE_GROUP_ID and SENTENCE_GROUP_ID_4
    (two of the four ingested groups). After persistence.write the people
    read back must equal the entities captured before the write.
    """
    # Arrange
    mock_get_matcher.return_value = _PatchedStateEntityMatcher(
        region_code=STATE_CODE,
        erroring_class=schema.StateSentenceGroup,
        erroring_external_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_4],
    )

    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
    )
    ingest_info.state_people.add(
        state_person_id="2_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID_3, SENTENCE_GROUP_ID_4],
    )
    for ingested_group_id in (
        SENTENCE_GROUP_ID,
        SENTENCE_GROUP_ID_2,
        SENTENCE_GROUP_ID_3,
        SENTENCE_GROUP_ID_4,
    ):
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=ingested_group_id, county_code=COUNTY_CODE
        )

    unknown_status_value = StateSentenceStatus.EXTERNAL_UNKNOWN.value

    # First DB person: sentence groups 1 and 2.
    db_person = schema.StatePerson(
        person_id=ID, full_name=FULL_NAME_1, state_code=STATE_CODE
    )
    db_person.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID,
            state_code=STATE_CODE,
        ),
        schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=STATE_CODE,
        ),
    ]
    db_person.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
    ]

    # Second DB person: sentence group 3 only.
    db_person_2 = schema.StatePerson(
        person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE
    )
    db_person_2.external_ids = [
        schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
    ]
    db_person_2.sentence_groups = [
        schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=unknown_status_value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=STATE_CODE,
        )
    ]

    # No updates
    expected_people = [self.to_entity(db_person), self.to_entity(db_person_2)]

    session = SessionFactory.for_schema_base(StateBase)
    for seeded in (db_person, db_person_2):
        session.add(seeded)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        expected_people,
        converter.convert_schema_objects_to_entity(persons),
    )