def test_write_preexisting_person(self):
    """Re-ingesting an already stored person updates their booking in place.

    A person with one in-custody booking is seeded directly into the jails
    database. The same external person/booking ids are then written again
    with an ingest time one day after the seeded timestamps. The test
    expects the stored booking to keep its ids and ``first_seen_time``
    while ``last_seen_time`` moves to the new ingest time.
    """
    # Arrange
    later_scrape_time = SCRAPER_START_DATETIME + timedelta(days=1)
    ingest_metadata = IngestMetadata.new_with_defaults(
        region=REGION_1,
        jurisdiction_id=JURISDICTION_ID,
        ingest_time=later_scrape_time,
    )

    stored_booking = schema.Booking(
        booking_id=BOOKING_ID,
        external_id=EXTERNAL_BOOKING_ID,
        admission_date_inferred=True,
        custody_status=CustodyStatus.IN_CUSTODY.value,
        last_seen_time=SCRAPER_START_DATETIME,
        first_seen_time=SCRAPER_START_DATETIME,
    )
    stored_person = schema.Person(
        person_id=PERSON_ID,
        jurisdiction_id=JURISDICTION_ID,
        external_id=EXTERNAL_PERSON_ID,
        region=REGION_1,
        bookings=[stored_booking],
    )

    seed_session = SessionFactory.for_schema_base(JailsBase)
    seed_session.add(stored_person)
    seed_session.commit()

    scraped = IngestInfo()
    scraped.people.add(
        full_name=FULL_NAME_1,
        person_id=EXTERNAL_PERSON_ID,
        booking_ids=[EXTERNAL_BOOKING_ID],
    )
    scraped.bookings.add(
        booking_id=EXTERNAL_BOOKING_ID,
        custody_status='IN CUSTODY',
    )

    # Act
    persistence.write(scraped, ingest_metadata)

    # Assert
    booking_after_write = county_entities.Booking.new_with_defaults(
        booking_id=BOOKING_ID,
        external_id=EXTERNAL_BOOKING_ID,
        admission_date_inferred=True,
        custody_status=CustodyStatus.IN_CUSTODY,
        custody_status_raw_text=BOOKING_CUSTODY_STATUS.upper(),
        last_seen_time=later_scrape_time,
        first_seen_time=SCRAPER_START_DATETIME,
    )
    person_after_write = county_entities.Person.new_with_defaults(
        person_id=PERSON_ID,
        external_id=EXTERNAL_PERSON_ID,
        region=REGION_1,
        jurisdiction_id=JURISDICTION_ID,
        bookings=[booking_after_write],
    )

    self.assertEqual(
        [person_after_write],
        county_dao.read_people(SessionFactory.for_schema_base(JailsBase)),
    )
def write_record() -> Tuple[str, HTTPStatus]:
    """Stub handler that forwards placeholder values to persistence.

    Every input is currently ``None``. The call is wrapped in a monitoring
    tag context keyed by region, and the handler always answers with an
    empty body and ``HTTPStatus.NOT_IMPLEMENTED``.
    """
    ingest_info = last_scraped_time = region = jurisdiction_id = None
    region_tags = {monitoring.TagKey.REGION: region}

    with monitoring.push_tags(region_tags):
        metadata = IngestMetadata(  # type: ignore
            region, jurisdiction_id, last_scraped_time)
        persistence.write(ingest_info, metadata)  # type: ignore

        return "", HTTPStatus.NOT_IMPLEMENTED
def test_localRun(self):
    """With ``os.getenv`` stubbed to return 'Local', a write persists nothing."""
    getenv_stub = Mock(return_value='Local')
    with patch('os.getenv', getenv_stub):
        # Arrange
        scraped = IngestInfo()
        scraped.people.add(full_name=FULL_NAME_1)

        # Act
        persistence.write(scraped, DEFAULT_METADATA)
        stored_people = county_dao.read_people(
            SessionFactory.for_schema_base(JailsBase))

        # Assert
        assert not stored_people
def write_record():
    """Stub handler that forwards placeholder values to persistence.

    Every input is currently ``None``. The call is wrapped in a monitoring
    tag context keyed by region, and the handler always answers with an
    empty body and ``HTTPStatus.NOT_IMPLEMENTED``.
    """
    # TODO: Something like `ingest_info = protobuf.read(request.data)`
    ingest_info = last_scraped_time = region = jurisdiction_id = None
    region_tags = {monitoring.TagKey.REGION: region}

    with monitoring.push_tags(region_tags):
        persistence.write(
            ingest_info,
            IngestMetadata(region, jurisdiction_id, last_scraped_time),
        )

        return '', HTTPStatus.NOT_IMPLEMENTED
def test_persistLocally(self):
    """With PERSIST_LOCALLY set to 'true', a local run still persists people.

    Both patches must be active for the duration of the test: ``os.getenv``
    is stubbed to return 'local' and ``os.environ`` gains
    ``PERSIST_LOCALLY='true'``.
    """
    # Arrange
    # The two context managers are separated by a comma, not `and`.
    # `with a and b:` evaluates the boolean expression first and enters
    # only `b`, so the os.getenv patch would never be applied.
    with patch('os.getenv', Mock(return_value='local')), \
            patch.dict('os.environ', {'PERSIST_LOCALLY': 'true'}):
        ingest_info = IngestInfo()
        ingest_info.people.add(full_name=FULL_NAME_1)

        # Act
        persistence.write(ingest_info, DEFAULT_METADATA)
        result = county_dao.read_people(
            SessionFactory.for_schema_base(JailsBase))

        # Assert
        assert len(result) == 1
        assert result[0].full_name == _format_full_name(FULL_NAME_1)
def _parse_and_persist_contents(self,
                                args: IngestArgsType,
                                contents_handle: ContentsHandleType):
    """Parse one non-empty ingest file and write the result to the database.

    The contents are parsed into an IngestInfo, converted to proto form,
    and handed to ``persistence.write`` together with the ingest metadata
    built for ``args``. Progress is logged after each step.

    Raises:
        DirectIngestError: with PARSE_ERROR when parsing yields nothing,
            or with PERSISTENCE_ERROR when the write reports failure.
    """
    parsed_info = self._parse(args, contents_handle)
    if not parsed_info:
        raise DirectIngestError(
            error_type=DirectIngestErrorType.PARSE_ERROR,
            msg="No IngestInfo after parse.")
    logging.info("Successfully parsed data for ingest run [%s]",
                 self._job_tag(args))

    parsed_proto = ingest_utils.convert_ingest_info_to_proto(parsed_info)
    logging.info(
        "Successfully converted ingest_info to proto for ingest "
        "run [%s]", self._job_tag(args))

    metadata = self._get_ingest_metadata(args)
    if not persistence.write(parsed_proto, metadata):
        raise DirectIngestError(
            error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
            msg="Persist step failed")

    logging.info("Successfully persisted for ingest run [%s]",
                 self._job_tag(args))
def test_twoDifferentPeople_persistsBoth(self):
    """Two distinct people written together are both read back."""
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(person_id='1_GENERATE', full_name=FULL_NAME_1)
    scraped.people.add(person_id='2_GENERATE', full_name=FULL_NAME_2)

    # Act
    persistence.write(scraped, DEFAULT_METADATA)
    stored_people = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase))

    # Assert
    # The read order asserted here is FULL_NAME_2 first, then FULL_NAME_1.
    assert len(stored_people) == 2
    first_person, second_person = stored_people
    assert first_person.full_name == _format_full_name(FULL_NAME_2)
    assert second_person.full_name == _format_full_name(FULL_NAME_1)
def test_write_noPeople(self):
    """Writing an empty IngestInfo leaves the jails database without people."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(
        region=REGION_1,
        jurisdiction_id=JURISDICTION_ID,
        ingest_time=SCRAPER_START_DATETIME + timedelta(days=1),
    )
    empty_info = IngestInfo()

    # Act
    persistence.write(empty_info, ingest_metadata)

    # Assert
    read_session = SessionFactory.for_schema_base(JailsBase)
    self.assertFalse(county_dao.read_people(read_session))
def test_nonRetryableError_failsImmediately(self, mock_commit, mock_close):
    """A non-retryable commit error propagates after a single attempt.

    The first commit raises a DatabaseError whose underlying error carries
    the NOT_NULL_VIOLATION pgcode. The write is expected to raise, commit
    to have been called exactly once, and the session to be closed once.
    """
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(full_name=FULL_NAME_1)

    # Not Null Violation is not retryable
    pg_error = create_autospec(psycopg2.OperationalError)
    pg_error.pgcode = NOT_NULL_VIOLATION
    commit_failure = sqlalchemy.exc.DatabaseError(
        statement=None, params=None, orig=pg_error)
    mock_commit.side_effect = [commit_failure, mock.DEFAULT]

    # Act / Assert
    with pytest.raises(sqlalchemy.exc.DatabaseError):
        persistence.write(scraped, DEFAULT_METADATA)

    # Assert
    single_commit = [call()]
    assert mock_commit.call_args_list == single_commit
    mock_close.assert_called_once()
def test_retryableError_retries(self, mock_commit, mock_close):
    """A retryable commit error is retried until the commit succeeds.

    Commit fails five times with a DatabaseError carrying the
    SERIALIZATION_FAILURE pgcode and then succeeds. The test expects six
    commit calls in total and a single session close.
    """
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(full_name=FULL_NAME_1)

    # Serialization Failure is retryable
    pg_error = create_autospec(psycopg2.OperationalError)
    pg_error.pgcode = SERIALIZATION_FAILURE
    commit_failure = sqlalchemy.exc.DatabaseError(
        statement=None, params=None, orig=pg_error)

    # 5 retries are allowed
    failures_then_success = [commit_failure] * 5 + [mock.DEFAULT]
    mock_commit.side_effect = failures_then_success

    # Act
    persistence.write(scraped, DEFAULT_METADATA)

    # Assert
    assert mock_commit.call_args_list == [call()] * 6
    mock_close.assert_called_once()
def test_multipleOpenBookings_raisesPersistenceError(self):
    """A person with two bookings on the same admission date fails to write.

    Despite the test name, the failure is observed as a falsy return value
    from ``persistence.write`` rather than as a raised exception.
    """
    scraped = ii.IngestInfo()
    scraped_person = scraped.create_person(full_name=FULL_NAME_1)
    for _ in range(2):
        scraped_person.create_booking(admission_date=DATE_RAW)

    scraped_proto = convert_ingest_info_to_proto(scraped)
    write_succeeded = persistence.write(scraped_proto, DEFAULT_METADATA)

    self.assertFalse(write_succeeded)
def test_readSinglePersonByName(self):
    """Reading people filtered by full name returns only the matching person."""
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(
        person_id='1_GENERATE',
        full_name=FULL_NAME_1,
        birthdate=BIRTHDATE_1,
    )
    scraped.people.add(
        person_id='2_GENERATE',
        full_name=FULL_NAME_2,
        birthdate=BIRTHDATE_2,
    )

    # Act
    persistence.write(scraped, DEFAULT_METADATA)
    matches = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase),
        full_name=_format_full_name(FULL_NAME_1))

    # Assert
    assert len(matches) == 1
    matched_person = matches[0]
    assert matched_person.full_name == _format_full_name(FULL_NAME_1)
    assert matched_person.birthdate == BIRTHDATE_1_DATE
def test_twoDifferentPeople_persistsNone(self):
    """When one of two people carries gender 'X', the write fails entirely.

    The write is expected to return a falsy value and no person at all is
    expected to be stored afterwards.
    """
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(person_id='1', full_name=FULL_NAME_1)
    scraped.people.add(person_id='2', full_name=FULL_NAME_2, gender='X')

    # Act
    write_succeeded = persistence.write(scraped, DEFAULT_METADATA)
    self.assertFalse(write_succeeded)
    stored_people = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase))

    # Assert
    assert not stored_people
def test_threeDifferentPeople_persistsTwoBelowThreshold(self):
    """Of three people, the two without the 'NO EXIST' booking are stored.

    The third person references a booking whose custody status is
    'NO EXIST'. Only the other two people are expected to be read back.
    """
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(person_id='1_GENERATE', full_name=FULL_NAME_2)
    scraped.people.add(person_id='2_GENERATE', full_name=FULL_NAME_3)
    scraped.people.add(
        person_id=EXTERNAL_PERSON_ID,
        full_name=FULL_NAME_1,
        booking_ids=[EXTERNAL_BOOKING_ID],
    )
    scraped.bookings.add(
        booking_id=EXTERNAL_BOOKING_ID,
        custody_status='NO EXIST',
    )

    # Act
    persistence.write(scraped, DEFAULT_METADATA)
    stored_people = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase))

    # Assert
    # The read order asserted here is FULL_NAME_3 first, then FULL_NAME_2.
    assert len(stored_people) == 2
    first_person, second_person = stored_people
    assert first_person.full_name == _format_full_name(FULL_NAME_3)
    assert second_person.full_name == _format_full_name(FULL_NAME_2)
def persist_to_database(
    region_code: str, session_start_time: datetime.datetime
) -> bool:
    """Persist every batched ingest info stored in Datastore for a region.

    The batched ingest infos for ``region_code`` and ``session_start_time``
    are merged into a single proto and written to the database. After a
    successful write the region's ingest infos are deleted from Datastore.

    Returns:
        The result of ``persistence.write``, or False when nothing was
        received, the merged proto has no people, or ``_should_abort``
        rejects the number of failed tasks.
    """
    region = regions.get_region(region_code)
    overrides = region.get_scraper_enum_overrides()

    batched_infos = _get_batch_ingest_info_list(region_code, session_start_time)
    logging.info("Received %s total ingest infos", len(batched_infos))

    # Nothing to merge: report and bail out before any further work.
    if not batched_infos:
        logging.error("No ingest infos received from Datastore")
        return False

    proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list(
        batched_infos
    )

    if not proto.people:
        logging.error("Scrape session returned 0 people.")
        return False

    for failed_datum in failed_tasks.values():
        logging.error(
            "Task with trace_id %s failed with error %s",
            failed_datum.trace_id,
            failed_datum.error,
        )

    failed_count = len(failed_tasks)
    if _should_abort(failed_count, len(proto.people)):
        logging.error(
            "Too many scraper tasks failed(%s), aborting write", len(failed_tasks)
        )
        return False

    metadata = IngestMetadata(
        region=region_code,
        jurisdiction_id=region.jurisdiction_id,
        ingest_time=session_start_time,
        facility_id=region.facility_id,
        enum_overrides=overrides,
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )

    did_write = persistence.write(proto, metadata)
    if did_write:
        # Only clear Datastore once the data is safely in the database.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(region_code)

    return did_write
def test_twoDifferentPeopleWithBooking_persistsNone(self):
    """One 'NO EXIST' booking among two people makes the whole write fail.

    The write is expected to return a falsy value and no person at all is
    expected to be stored afterwards.
    """
    # Arrange
    scraped = IngestInfo()
    scraped.people.add(full_name=FULL_NAME_2)
    scraped.people.add(
        full_name=FULL_NAME_1,
        person_id=EXTERNAL_PERSON_ID,
        booking_ids=[EXTERNAL_BOOKING_ID],
    )
    scraped.bookings.add(
        booking_id=EXTERNAL_BOOKING_ID,
        custody_status='NO EXIST',
    )

    # Act
    write_succeeded = persistence.write(scraped, DEFAULT_METADATA)
    self.assertFalse(write_succeeded)
    stored_people = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase))

    # Assert
    assert not stored_people
def _parse_and_persist_contents(self,
                                args: IngestArgsType,
                                contents: ContentsType):
    """Parse one non-empty ingest file and write the result to the database.

    The contents are parsed into an IngestInfo, converted to proto form,
    and handed to ``persistence.write`` together with metadata built from
    this controller's region, ``args.ingest_time``, enum overrides and
    system level. Progress is logged after each step.

    Raises:
        DirectIngestError: with PARSE_ERROR when parsing yields nothing,
            or with PERSISTENCE_ERROR when the write reports failure.
    """
    parsed_info = self._parse(args, contents)

    # TODO(1738): implement retry on fail.
    if not parsed_info:
        raise DirectIngestError(
            error_type=DirectIngestErrorType.PARSE_ERROR,
            msg="No IngestInfo after parse.")
    logging.info("Successfully parsed data for ingest run [%s]",
                 self._job_tag(args))

    parsed_proto = ingest_utils.convert_ingest_info_to_proto(parsed_info)
    logging.info(
        "Successfully converted ingest_info to proto for ingest "
        "run [%s]", self._job_tag(args))

    metadata = IngestMetadata(
        self.region.region_code,
        self.region.jurisdiction_id,
        args.ingest_time,
        self.get_enum_overrides(),
        self.system_level,
    )
    if not persistence.write(parsed_proto, metadata):
        raise DirectIngestError(
            error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
            msg="Persist step failed")

    logging.info("Successfully persisted for ingest run [%s]",
                 self._job_tag(args))
def _generic_scrape(self, request: QueueRequest):
    """General handler for all scrape tasks.

    This is the generic entry point into all types of scrapes. Based on
    the task carried by ``request`` it:

    1. obtains page content, either from ``task.content`` when it is
       already present or by fetching ``task.endpoint``;
    2. populates scraped data when ``should_scrape_data`` says so;
    3. enqueues follow-up tasks when ``should_get_more_tasks`` says so;
    4. persists the scraped data (directly or through batch persistence,
       depending on ``BATCH_WRITES``) and stores any single counts when
       the scraped data is marked for persistence.

    Args:
        request: the queued request; its ``next_task``, ``ingest_info``,
            ``scrape_type`` and ``scraper_start_time`` are read here.

    Raises:
        ScraperFetchError: if fetching content fails.
        ScraperPopulateDataError: if populating data fails.
        ScraperGetMoreTasksError: if computing follow-up tasks fails.
        Exception: any other error is re-raised after (when
            ``BATCH_WRITES`` is set) being recorded through
            ``batch_persistence.write_error``.
    """
    try:
        task = request.next_task

        # Here we handle a special case where we weren't really sure
        # we were going to get data when we submitted a task, but then
        # we ended up with data, so no more requests are required,
        # just the content we already have.
        # TODO(#680): remove this
        if task.content is not None:
            content = self._parse_html_content(task.content)
            cookies = None
        else:
            post_data = task.post_data

            # Let the child transform the post_data if it wants before
            # sending the requests. This hook is in here in case the
            # child did something like compress the post_data before
            # it put it on the queue.
            self.transform_post_data(post_data)

            # We always fetch some content before doing anything.
            # NOTE(review): an earlier version of this comment mentioned
            # using `get` to default post_data to None; the code now reads
            # task.post_data directly.
            try:
                content, cookies = self._fetch_content(
                    task.endpoint, task.response_type,
                    headers=task.headers, cookies=task.cookies,
                    params=task.params, post_data=post_data,
                    json_data=task.json)
            except Exception as e:
                raise ScraperFetchError(str(e)) from e

        scraped_data = None
        if self.should_scrape_data(task.task_type):
            # If we want to scrape data, we should either create an
            # ingest_info object or get the one that already exists.
            logging.info("Scraping data for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)
            try:
                scraped_data = self.populate_data(
                    content, task, request.ingest_info or IngestInfo())
            except Exception as e:
                raise ScraperPopulateDataError(str(e)) from e

        if self.should_get_more_tasks(task.task_type):
            logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)

            # Only send along ingest info if it will not be persisted now.
            ingest_info_to_send = None
            if scraped_data is not None and not scraped_data.persist:
                ingest_info_to_send = scraped_data.ingest_info

            try:
                # pylint: disable=assignment-from-no-return
                next_tasks = self.get_more_tasks(content, task)
            except Exception as e:
                raise ScraperGetMoreTasksError(str(e)) from e

            for next_task in next_tasks:
                # Include cookies received from response, if any
                # NOTE(review): `cookies` is updated in place on every
                # iteration, so cookies of earlier next_tasks carry over
                # into later ones — confirm this is intended.
                if cookies:
                    cookies.update(next_task.cookies)
                    next_task = Task.evolve(next_task, cookies=cookies)

                self.add_task(
                    '_generic_scrape',
                    QueueRequest(
                        scrape_type=request.scrape_type,
                        scraper_start_time=request.scraper_start_time,
                        next_task=next_task,
                        ingest_info=ingest_info_to_send,
                    ))

        if scraped_data is not None and scraped_data.persist:
            if scraped_data.ingest_info:
                # NOTE(review): the "4" in this message is a literal while
                # the actual cap is constants.MAX_PEOPLE_TO_LOG — confirm
                # the two agree.
                logging.info("Logging at most 4 people (were %d):",
                             len(scraped_data.ingest_info.people))
                loop_count = min(len(scraped_data.ingest_info.people),
                                 constants.MAX_PEOPLE_TO_LOG)
                for i in range(loop_count):
                    logging.info("[%s]",
                                 str(scraped_data.ingest_info.people[i]))
                logging.info("Last seen time of person being set as: [%s]",
                             request.scraper_start_time)
                metadata = IngestMetadata(self.region.region_code,
                                          self.region.jurisdiction_id,
                                          request.scraper_start_time,
                                          self.get_enum_overrides())
                if self.BATCH_WRITES:
                    # Defer the database write: hand the ingest info to
                    # batch persistence keyed by region and scrape type.
                    logging.info(
                        "Queuing ingest_info ([%d] people) to "
                        "batch_persistence for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    scrape_key = ScrapeKey(self.region.region_code,
                                           request.scrape_type)
                    batch_persistence.write(
                        ingest_info=scraped_data.ingest_info,
                        scrape_key=scrape_key,
                        task=task,
                    )
                else:
                    logging.info(
                        "Writing ingest_info ([%d] people) to the database"
                        " for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    persistence.write(
                        ingest_utils.convert_ingest_info_to_proto(
                            scraped_data.ingest_info), metadata)
            for sc in scraped_data.single_counts:
                # Counts without a date borrow the start date of the
                # current BACKGROUND scrape session, when there is one.
                if not sc.date:
                    scrape_key = ScrapeKey(self.region.region_code,
                                           constants.ScrapeType.BACKGROUND)
                    session = sessions.get_current_session(scrape_key)
                    if session:
                        sc = attr.evolve(sc, date=session.start.date())
                single_count.store_single_count(
                    sc, self.region.jurisdiction_id)
    except Exception as e:
        # Record the failure for the batch before propagating it.
        # NOTE(review): `task` is unbound here if reading
        # request.next_task itself raised — confirm that cannot happen.
        if self.BATCH_WRITES:
            scrape_key = ScrapeKey(self.region.region_code,
                                   request.scrape_type)
            batch_persistence.write_error(
                error=str(e),
                trace_id=get_trace_id_from_flask(),
                task=task,
                scrape_key=scrape_key,
            )
        raise e
def test_state_threeSentenceGroups_dontPersistAboveThreshold(self):
    """A state write leaves the database untouched when it should not persist.

    Two people are seeded: the first owns sentence groups with external
    ids SENTENCE_GROUP_ID and SENTENCE_GROUP_ID_2, the second owns another
    group that reuses SENTENCE_GROUP_ID_2. The ingested person references
    both external ids with a county code. After the write, both stored
    people are expected to equal their pre-write entity form.
    """
    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id='1_GENERATE',
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2])
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID, county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_2,
        county_code=COUNTY_CODE)

    # First stored person: two sentence groups and one external id.
    db_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
    db_sentence_group = schema.StateSentenceGroup(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID,
        state_code=REGION_CODE)
    db_sentence_group_2 = schema.StateSentenceGroup(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=REGION_CODE)
    db_external_id = schema.StatePersonExternalId(person_external_id_id=ID,
                                                  state_code=REGION_CODE,
                                                  external_id=EXTERNAL_ID,
                                                  id_type=ID_TYPE)
    db_person.sentence_groups = [db_sentence_group, db_sentence_group_2]
    db_person.external_ids = [db_external_id]

    # Second stored person: a sentence group that reuses the external id
    # SENTENCE_GROUP_ID_2 already attached to the first person.
    # NOTE(review): presumably this duplicate is what triggers the
    # matching errors the test name refers to — confirm.
    db_person_2 = schema.StatePerson(person_id=ID_2, full_name=FULL_NAME_1)
    db_sentence_group_2_dup = schema.StateSentenceGroup(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=REGION_CODE)
    db_external_id_2 = schema.StatePersonExternalId(
        person_external_id_id=ID_2,
        state_code=REGION_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE)
    db_person_2.sentence_groups = [db_sentence_group_2_dup]
    db_person_2.external_ids = [db_external_id_2]

    # No updates: the expected entities are snapshots taken before the
    # write.
    expected_person = self.to_entity(db_person)
    expected_person_2 = self.to_entity(db_person_2)

    session = SessionFactory.for_schema_base(StateBase)
    session.add(db_person)
    session.add(db_person_2)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    # Read back through a fresh session.
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual([expected_person, expected_person_2],
                     converter.convert_schema_objects_to_entity(persons))
def test_state_threeSentenceGroups_persistsTwoBelowThreshold(self):
    """Two of three ingested sentence groups are updated; the third is not.

    Two people are seeded: the first owns three sentence groups, the
    second owns a group that reuses the external id SENTENCE_GROUP_ID_3.
    The ingested person references all three external ids with a county
    code. After the write, the first two groups are expected to carry the
    county code while both groups with SENTENCE_GROUP_ID_3 stay unchanged.
    """
    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(state_person_id='1_GENERATE',
                                 state_sentence_group_ids=[
                                     SENTENCE_GROUP_ID,
                                     SENTENCE_GROUP_ID_2,
                                     SENTENCE_GROUP_ID_3
                                 ])
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID, county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_2,
        county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_3,
        county_code=COUNTY_CODE)

    # First stored person: three sentence groups and one external id.
    db_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
    db_sentence_group = schema.StateSentenceGroup(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID,
        state_code=REGION_CODE)
    db_sentence_group_2 = schema.StateSentenceGroup(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=REGION_CODE)
    db_sentence_group_3 = schema.StateSentenceGroup(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=REGION_CODE)
    db_external_id = schema.StatePersonExternalId(person_external_id_id=ID,
                                                  state_code=REGION_CODE,
                                                  external_id=EXTERNAL_ID,
                                                  id_type=ID_TYPE)
    db_person.sentence_groups = [
        db_sentence_group, db_sentence_group_2, db_sentence_group_3
    ]
    db_person.external_ids = [db_external_id]

    # Second stored person: a sentence group that reuses the external id
    # SENTENCE_GROUP_ID_3 already attached to the first person.
    db_person_2 = schema.StatePerson(person_id=ID_2, full_name=FULL_NAME_1)
    db_sentence_group_3_dup = schema.StateSentenceGroup(
        sentence_group_id=ID_4,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=REGION_CODE)
    db_external_id_2 = schema.StatePersonExternalId(
        person_external_id_id=ID_2,
        state_code=REGION_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE)
    db_person_2.sentence_groups = [db_sentence_group_3_dup]
    db_person_2.external_ids = [db_external_id_2]

    # Expected state of the first person after the write. The person is
    # created with empty children first so the children can point back
    # at it.
    expected_person = StatePerson.new_with_defaults(person_id=ID,
                                                    full_name=FULL_NAME_1,
                                                    external_ids=[],
                                                    sentence_groups=[])
    expected_external_id = StatePersonExternalId.new_with_defaults(
        person_external_id_id=ID,
        state_code=REGION_CODE,
        external_id=EXTERNAL_ID,
        id_type=ID_TYPE,
        person=expected_person)
    expected_sentence_group = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID,
        state_code=REGION_CODE,
        county_code=COUNTY_CODE,
        person=expected_person)
    expected_sentence_group_2 = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=REGION_CODE,
        county_code=COUNTY_CODE,
        person=expected_person)
    # No county code because errors during match
    expected_sentence_group_3 = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=REGION_CODE,
        person=expected_person)
    expected_person.external_ids = [expected_external_id]
    expected_person.sentence_groups = [
        expected_sentence_group, expected_sentence_group_2,
        expected_sentence_group_3
    ]

    # Expected state of the second person: unchanged by the write.
    expected_person_2 = StatePerson.new_with_defaults(
        person_id=ID_2, full_name=FULL_NAME_1)
    expected_external_id_2 = StatePersonExternalId.new_with_defaults(
        person_external_id_id=ID_2,
        state_code=REGION_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE,
        person=expected_person_2)
    # No county code because unmatched
    expected_sentence_group_3_dup = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID_4,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=REGION_CODE,
        person=expected_person_2)
    expected_person_2.sentence_groups = [expected_sentence_group_3_dup]
    expected_person_2.external_ids = [expected_external_id_2]

    session = SessionFactory.for_schema_base(StateBase)
    session.add(db_person)
    session.add(db_person_2)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    # Read back through a fresh session.
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual([expected_person, expected_person_2],
                     converter.convert_schema_objects_to_entity(persons))
def test_state_threeSentenceGroups_dontPersistAboveThreshold(
        self, mock_get_matcher):
    """A state write leaves the database untouched when it should not persist.

    The entity matcher is replaced by a patched matcher configured with
    StateSentenceGroup as the erroring class and the external ids
    SENTENCE_GROUP_ID and SENTENCE_GROUP_ID_4. Two people with four
    sentence groups in total are ingested. After the write, both stored
    people are expected to equal their pre-write entity form.
    """
    # Arrange: matcher configured to error on two sentence group ids.
    mock_get_matcher.return_value = _PatchedStateEntityMatcher(
        region_code=STATE_CODE,
        erroring_class=schema.StateSentenceGroup,
        erroring_external_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_4],
    )

    # Arrange: ingested people and their sentence groups.
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
    )
    ingest_info.state_people.add(
        state_person_id="2_GENERATE",
        state_sentence_group_ids=[
            SENTENCE_GROUP_ID_3, SENTENCE_GROUP_ID_4
        ],
    )
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID, county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_2,
        county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_3,
        county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_4,
        county_code=COUNTY_CODE)

    # First stored person: two sentence groups and one external id.
    db_person = schema.StatePerson(person_id=ID,
                                   full_name=FULL_NAME_1,
                                   state_code=STATE_CODE)
    db_sentence_group = schema.StateSentenceGroup(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID,
        state_code=STATE_CODE,
    )
    db_sentence_group_2 = schema.StateSentenceGroup(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=STATE_CODE,
    )
    db_external_id = schema.StatePersonExternalId(
        person_external_id_id=ID,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID,
        id_type=ID_TYPE,
    )
    db_person.sentence_groups = [db_sentence_group, db_sentence_group_2]
    db_person.external_ids = [db_external_id]

    # Second stored person: one sentence group and one external id.
    db_person_2 = schema.StatePerson(person_id=ID_2,
                                     full_name=FULL_NAME_1,
                                     state_code=STATE_CODE)
    db_sentence_group_3 = schema.StateSentenceGroup(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=STATE_CODE,
    )
    db_external_id_2 = schema.StatePersonExternalId(
        person_external_id_id=ID_2,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE,
    )
    db_person_2.external_ids = [db_external_id_2]
    db_person_2.sentence_groups = [db_sentence_group_3]

    # No updates: the expected entities are snapshots taken before the
    # write.
    expected_person = self.to_entity(db_person)
    expected_person_2 = self.to_entity(db_person_2)

    session = SessionFactory.for_schema_base(StateBase)
    session.add(db_person)
    session.add(db_person_2)
    session.commit()

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    # Read back through a fresh session.
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        [expected_person, expected_person_2],
        converter.convert_schema_objects_to_entity(persons),
    )
def test_readPersonAndAllRelationships(self):
    """A fully populated person is read back with all related entities.

    One person is written with a booking, an arrest, two charges sharing
    a single bond, and one sentence per charge. The test then walks the
    stored object graph and checks a field at each level.
    """
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(
        region=REGION_1,
        jurisdiction_id=JURISDICTION_ID,
        ingest_time=SCRAPER_START_DATETIME,
    )

    scraped = IngestInfo()
    scraped.people.add(full_name=FULL_NAME_1, booking_ids=['BOOKING_ID'])
    scraped.bookings.add(
        booking_id='BOOKING_ID',
        facility=FACILITY,
        custody_status=BOOKING_CUSTODY_STATUS,
        arrest_id='ARREST_ID',
        charge_ids=['CHARGE_ID_1', 'CHARGE_ID_2'],
    )
    scraped.arrests.add(arrest_id='ARREST_ID', officer_name=OFFICER_NAME)
    scraped.bonds.add(
        bond_id='SHARED_BOND_ID',
        bond_type=BOND_TYPE,
        status=BOND_STATUS,
    )

    first_charge = Charge(
        charge_id='CHARGE_ID_1',
        name=CHARGE_NAME_1,
        status=CHARGE_STATUS,
        bond_id='SHARED_BOND_ID',
        sentence_id='SENTENCE_ID_1',
    )
    second_charge = Charge(
        charge_id='CHARGE_ID_2',
        name=CHARGE_NAME_2,
        status=CHARGE_STATUS,
        bond_id='SHARED_BOND_ID',
        sentence_id='SENTENCE_ID_2',
    )
    scraped.charges.extend([first_charge, second_charge])

    first_sentence = Sentence(
        sentence_id='SENTENCE_ID_1',
        fine_dollars=FINE_1,
        status=SENTENCE_STATUS,
    )
    second_sentence = Sentence(
        sentence_id='SENTENCE_ID_2',
        fine_dollars=FINE_2,
        status=SENTENCE_STATUS,
    )
    scraped.sentences.extend([first_sentence, second_sentence])

    # Act
    persistence.write(scraped, ingest_metadata)
    stored_people = county_dao.read_people(
        SessionFactory.for_schema_base(JailsBase))

    # Assert
    assert len(stored_people) == 1
    stored_person = stored_people[0]
    assert stored_person.full_name == _format_full_name(FULL_NAME_1)

    assert len(stored_person.bookings) == 1
    stored_booking = stored_person.bookings[0]
    assert stored_booking.facility == FACILITY
    assert stored_booking.last_seen_time == SCRAPER_START_DATETIME

    stored_arrest = stored_booking.arrest
    assert stored_arrest.officer_name == OFFICER_NAME

    stored_charges = stored_booking.charges
    assert len(stored_charges) == 2
    assert stored_charges[0].name == CHARGE_NAME_1
    assert stored_charges[1].name == CHARGE_NAME_2

    # Both charges reference the same bond id, so the types must agree.
    assert stored_charges[0].bond.bond_type == \
        stored_charges[1].bond.bond_type

    assert stored_charges[0].sentence.fine_dollars == FINE_1_INT
    assert stored_charges[1].sentence.fine_dollars == FINE_2_INT
def test_state_threeSentenceGroups_persistsTwoBelowThreshold(
        self, mock_get_matcher):
    """Ensure that the number of errors is below the ND specific threshold.

    The entity matcher is replaced by a patched matcher configured with
    StateSentenceGroup as the erroring class and SENTENCE_GROUP_ID as the
    only erroring external id. Three sentence groups are ingested with a
    county code. After the write, the groups with SENTENCE_GROUP_ID_2 and
    SENTENCE_GROUP_ID_3 are expected to carry the county code, while the
    group with SENTENCE_GROUP_ID stays unchanged.
    """
    mock_get_matcher.return_value = _PatchedStateEntityMatcher(
        region_code=STATE_CODE,
        erroring_class=schema.StateSentenceGroup,
        erroring_external_ids=[SENTENCE_GROUP_ID],
    )

    # Set the ENTITY_MATCHING_THRESHOLD to 0, such that we can verify that the forty percent threshold for
    # ENTITY_MATCHING_THRESHOLD is dictated by the state-specific override in
    # STATE_CODE_TO_ENTITY_MATCHING_THRESHOLD_FORTY_PERCENT.
    # NOTE(review): this mutates a dict defined outside the test and never
    # restores it here — confirm it is reset elsewhere (e.g. by a fixture
    # or class-level patch) so other tests are not affected.
    STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS[
        ENTITY_MATCHING_THRESHOLD] = 0

    # Arrange
    ingest_info = IngestInfo()
    ingest_info.state_people.add(
        state_person_id="1_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
    )
    ingest_info.state_people.add(
        state_person_id="2_GENERATE",
        state_sentence_group_ids=[SENTENCE_GROUP_ID_3])
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID, county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_2,
        county_code=COUNTY_CODE)
    ingest_info.state_sentence_groups.add(
        state_sentence_group_id=SENTENCE_GROUP_ID_3,
        county_code=COUNTY_CODE)

    # First stored person: two sentence groups and one external id.
    db_person = schema.StatePerson(person_id=ID,
                                   full_name=FULL_NAME_1,
                                   state_code=STATE_CODE)
    db_sentence_group = schema.StateSentenceGroup(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID,
        state_code=STATE_CODE,
    )
    db_sentence_group_2 = schema.StateSentenceGroup(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=STATE_CODE,
    )
    db_external_id = schema.StatePersonExternalId(
        person_external_id_id=ID,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID,
        id_type=ID_TYPE,
    )
    db_person.sentence_groups = [db_sentence_group, db_sentence_group_2]
    db_person.external_ids = [db_external_id]

    # Second stored person: one sentence group and one external id.
    db_person_2 = schema.StatePerson(person_id=ID_2,
                                     full_name=FULL_NAME_1,
                                     state_code=STATE_CODE)
    db_sentence_group_3 = schema.StateSentenceGroup(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=STATE_CODE,
    )
    db_external_id_2 = schema.StatePersonExternalId(
        person_external_id_id=ID_2,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE,
    )
    db_person_2.external_ids = [db_external_id_2]
    db_person_2.sentence_groups = [db_sentence_group_3]

    session = SessionFactory.for_schema_base(StateBase)
    session.add(db_person)
    session.add(db_person_2)
    session.commit()

    # Expected state of the first person after the write. The person is
    # created with empty children first so the children can point back
    # at it.
    expected_person = StatePerson.new_with_defaults(
        person_id=ID,
        full_name=FULL_NAME_1,
        external_ids=[],
        sentence_groups=[],
        state_code=STATE_CODE,
    )
    expected_external_id = StatePersonExternalId.new_with_defaults(
        person_external_id_id=ID,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID,
        id_type=ID_TYPE,
        person=expected_person,
    )
    # No county code because errors during match
    expected_sentence_group = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID,
        state_code=STATE_CODE,
        person=expected_person,
    )
    expected_sentence_group_2 = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID_2,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID_2,
        state_code=STATE_CODE,
        county_code=COUNTY_CODE,
        person=expected_person,
    )
    expected_person.external_ids = [expected_external_id]
    expected_person.sentence_groups = [
        expected_sentence_group,
        expected_sentence_group_2,
    ]

    # Expected state of the second person: its sentence group gains the
    # ingested county code.
    expected_person_2 = StatePerson.new_with_defaults(
        person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE)
    expected_external_id_2 = StatePersonExternalId.new_with_defaults(
        person_external_id_id=ID_2,
        state_code=STATE_CODE,
        external_id=EXTERNAL_ID_2,
        id_type=ID_TYPE,
        person=expected_person_2,
    )
    expected_sentence_group_3 = StateSentenceGroup.new_with_defaults(
        sentence_group_id=ID_3,
        status=StateSentenceStatus.EXTERNAL_UNKNOWN,
        external_id=SENTENCE_GROUP_ID_3,
        state_code=STATE_CODE,
        county_code=COUNTY_CODE,
        person=expected_person_2,
    )
    expected_person_2.sentence_groups = [expected_sentence_group_3]
    expected_person_2.external_ids = [expected_external_id_2]

    # Act
    persistence.write(ingest_info, DEFAULT_METADATA)
    # Read back through a fresh session.
    session = SessionFactory.for_schema_base(StateBase)
    persons = dao.read_people(session)

    # Assert
    self.assertEqual(
        [expected_person, expected_person_2],
        converter.convert_schema_objects_to_entity(persons),
    )