예제 #1
0
    def test_write_preexisting_person(self):
        """Re-ingesting an already stored person updates the stored booking.

        One person with one in-custody booking is seeded, then the same
        person and booking are ingested one day later.  The person read back
        is expected to keep the booking's original first-seen time while its
        last-seen time moves to the newer ingest time.
        """
        # Arrange
        latest_ingest_time = SCRAPER_START_DATETIME + timedelta(days=1)
        ingest_metadata = IngestMetadata.new_with_defaults(
            region=REGION_1,
            jurisdiction_id=JURISDICTION_ID,
            ingest_time=latest_ingest_time,
        )

        # Rows that exist in the database before the write under test.
        stored_booking = schema.Booking(
            booking_id=BOOKING_ID,
            external_id=EXTERNAL_BOOKING_ID,
            admission_date_inferred=True,
            custody_status=CustodyStatus.IN_CUSTODY.value,
            last_seen_time=SCRAPER_START_DATETIME,
            first_seen_time=SCRAPER_START_DATETIME,
        )
        stored_person = schema.Person(
            person_id=PERSON_ID,
            jurisdiction_id=JURISDICTION_ID,
            external_id=EXTERNAL_PERSON_ID,
            region=REGION_1,
            bookings=[stored_booking],
        )

        seed_session = SessionFactory.for_schema_base(JailsBase)
        seed_session.add(stored_person)
        seed_session.commit()

        # Freshly scraped data describing the same person and booking.
        incoming = IngestInfo()
        incoming.people.add(
            full_name=FULL_NAME_1,
            person_id=EXTERNAL_PERSON_ID,
            booking_ids=[EXTERNAL_BOOKING_ID],
        )
        incoming.bookings.add(
            booking_id=EXTERNAL_BOOKING_ID,
            custody_status='IN CUSTODY',
        )

        # Act
        persistence.write(incoming, ingest_metadata)

        # Assert
        booking_after_write = county_entities.Booking.new_with_defaults(
            booking_id=BOOKING_ID,
            external_id=EXTERNAL_BOOKING_ID,
            admission_date_inferred=True,
            custody_status=CustodyStatus.IN_CUSTODY,
            custody_status_raw_text=BOOKING_CUSTODY_STATUS.upper(),
            last_seen_time=latest_ingest_time,
            first_seen_time=SCRAPER_START_DATETIME,
        )
        person_after_write = county_entities.Person.new_with_defaults(
            person_id=PERSON_ID,
            external_id=EXTERNAL_PERSON_ID,
            region=REGION_1,
            jurisdiction_id=JURISDICTION_ID,
            bookings=[booking_after_write],
        )
        people_read_back = county_dao.read_people(
            SessionFactory.for_schema_base(JailsBase))
        self.assertEqual([person_after_write], people_read_back)
예제 #2
0
def write_record() -> Tuple[str, HTTPStatus]:
    """Placeholder handler that runs a persistence write with unset inputs.

    The ingest info, region, jurisdiction id and scrape time are all ``None``
    for now.  Returns an empty body together with
    ``HTTPStatus.NOT_IMPLEMENTED``.
    """
    region = None
    jurisdiction_id = None
    last_scraped_time = None
    ingest_info = None

    region_tags = {monitoring.TagKey.REGION: region}
    with monitoring.push_tags(region_tags):
        metadata = IngestMetadata(region, jurisdiction_id, last_scraped_time)  # type: ignore
        persistence.write(ingest_info, metadata)  # type: ignore
        return "", HTTPStatus.NOT_IMPLEMENTED
예제 #3
0
    def test_localRun(self):
        """With ``os.getenv`` patched to return 'Local', nothing is stored."""
        getenv_stub = Mock(return_value='Local')
        with patch('os.getenv', getenv_stub):
            # Arrange
            incoming = IngestInfo()
            incoming.people.add(full_name=FULL_NAME_1)

            # Act
            persistence.write(incoming, DEFAULT_METADATA)
            read_session = SessionFactory.for_schema_base(JailsBase)
            people_read_back = county_dao.read_people(read_session)

            # Assert
            assert not people_read_back
예제 #4
0
def write_record():
    """Placeholder handler that runs a persistence write with unset inputs.

    Returns an empty body together with ``HTTPStatus.NOT_IMPLEMENTED``.
    """
    # TODO: Something like `ingest_info = protobuf.read(request.data)`
    ingest_info = None
    region = None
    jurisdiction_id = None
    last_scraped_time = None

    region_tags = {monitoring.TagKey.REGION: region}
    with monitoring.push_tags(region_tags):
        ingest_metadata = IngestMetadata(
            region, jurisdiction_id, last_scraped_time)
        persistence.write(ingest_info, ingest_metadata)
        return '', HTTPStatus.NOT_IMPLEMENTED
예제 #5
0
    def test_persistLocally(self):
        """With PERSIST_LOCALLY set to 'true', a written person is readable.

        Runs the write with ``os.getenv`` patched to return 'local' and
        ``os.environ`` patched to contain ``PERSIST_LOCALLY='true'``, then
        checks that exactly one person with the formatted name is read back.
        """
        # Arrange
        # The two patchers must be listed as separate context managers.
        # Combining them with `and` evaluates to the second patcher only
        # (a patcher object is truthy), so the `os.getenv` patch would never
        # be entered.
        # NOTE(review): `os.getenv` is now genuinely mocked for the whole
        # body; this assumes the code under test reads PERSIST_LOCALLY via
        # `os.environ` rather than `os.getenv` -- confirm.
        with patch('os.getenv', Mock(return_value='local')), \
                patch.dict('os.environ', {'PERSIST_LOCALLY': 'true'}):
            ingest_info = IngestInfo()
            ingest_info.people.add(full_name=FULL_NAME_1)

            # Act
            persistence.write(ingest_info, DEFAULT_METADATA)
            result = county_dao.read_people(
                SessionFactory.for_schema_base(JailsBase))

            # Assert
            assert len(result) == 1
            assert result[0].full_name == _format_full_name(FULL_NAME_1)
예제 #6
0
    def _parse_and_persist_contents(self, args: IngestArgsType,
                                    contents_handle: ContentsHandleType):
        """
        Parses the given contents, converts the parse result to a proto and
        writes it through the persistence layer.

        Raises:
            DirectIngestError: with PARSE_ERROR when parsing yields a falsy
                result, or with PERSISTENCE_ERROR when the write returns a
                falsy value.
        """
        parsed_info = self._parse(args, contents_handle)
        if not parsed_info:
            raise DirectIngestError(
                msg="No IngestInfo after parse.",
                error_type=DirectIngestErrorType.PARSE_ERROR,
            )

        logging.info(
            "Successfully parsed data for ingest run [%s]",
            self._job_tag(args),
        )

        proto = ingest_utils.convert_ingest_info_to_proto(parsed_info)

        logging.info(
            "Successfully converted ingest_info to proto for ingest "
            "run [%s]",
            self._job_tag(args),
        )

        metadata = self._get_ingest_metadata(args)
        if not persistence.write(proto, metadata):
            raise DirectIngestError(
                msg="Persist step failed",
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
            )

        logging.info(
            "Successfully persisted for ingest run [%s]",
            self._job_tag(args),
        )
예제 #7
0
    def test_twoDifferentPeople_persistsBoth(self):
        """Two distinct ingested people are both stored and read back."""
        # Arrange
        incoming = IngestInfo()
        people_to_ingest = (
            ('1_GENERATE', FULL_NAME_1),
            ('2_GENERATE', FULL_NAME_2),
        )
        for generated_id, raw_name in people_to_ingest:
            incoming.people.add(person_id=generated_id, full_name=raw_name)

        # Act
        persistence.write(incoming, DEFAULT_METADATA)
        read_session = SessionFactory.for_schema_base(JailsBase)
        people_read_back = county_dao.read_people(read_session)

        # Assert
        assert len(people_read_back) == 2

        first_read, second_read = people_read_back[0], people_read_back[1]
        assert first_read.full_name == _format_full_name(FULL_NAME_2)
        assert second_read.full_name == _format_full_name(FULL_NAME_1)
예제 #8
0
    def test_write_noPeople(self):
        """Writing an empty IngestInfo leaves no people to read back."""
        # Arrange
        ingest_time = SCRAPER_START_DATETIME + timedelta(days=1)
        ingest_metadata = IngestMetadata.new_with_defaults(
            region=REGION_1,
            jurisdiction_id=JURISDICTION_ID,
            ingest_time=ingest_time,
        )
        empty_ingest_info = IngestInfo()

        # Act
        persistence.write(empty_ingest_info, ingest_metadata)

        # Assert
        read_session = SessionFactory.for_schema_base(JailsBase)
        self.assertFalse(county_dao.read_people(read_session))
예제 #9
0
    def test_nonRetryableError_failsImmediately(self, mock_commit, mock_close):
        """A non-retryable commit error propagates after a single attempt."""
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(full_name=FULL_NAME_1)

        # Not Null Violation is not retryable
        pg_error = create_autospec(psycopg2.OperationalError)
        pg_error.pgcode = NOT_NULL_VIOLATION
        commit_failure = sqlalchemy.exc.DatabaseError(
            statement=None, params=None, orig=pg_error)
        mock_commit.side_effect = [commit_failure, mock.DEFAULT]

        # Act / Assert
        with pytest.raises(sqlalchemy.exc.DatabaseError):
            persistence.write(incoming, DEFAULT_METADATA)

        # Assert: exactly one commit attempt, and the session was closed.
        single_attempt = [call()]
        assert mock_commit.call_args_list == single_attempt
        mock_close.assert_called_once()
예제 #10
0
    def test_retryableError_retries(self, mock_commit, mock_close):
        """Five retryable commit failures then a success mean six attempts."""
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(full_name=FULL_NAME_1)

        # Serialization Failure is retryable
        pg_error = create_autospec(psycopg2.OperationalError)
        pg_error.pgcode = SERIALIZATION_FAILURE
        commit_failure = sqlalchemy.exc.DatabaseError(
            statement=None, params=None, orig=pg_error)
        # 5 retries is allowed
        failures_then_success = [commit_failure] * 5 + [mock.DEFAULT]
        mock_commit.side_effect = failures_then_success

        # Act
        persistence.write(incoming, DEFAULT_METADATA)

        # Assert
        six_attempts = [call()] * 6
        assert mock_commit.call_args_list == six_attempts
        mock_close.assert_called_once()
예제 #11
0
    def test_multipleOpenBookings_raisesPersistenceError(self):
        """A person with two bookings sharing an admission date fails to write.

        The write is expected to return a falsy value.
        """
        scraped = ii.IngestInfo()
        scraped_person = scraped.create_person(full_name=FULL_NAME_1)
        for _ in range(2):
            scraped_person.create_booking(admission_date=DATE_RAW)

        proto = convert_ingest_info_to_proto(scraped)
        write_result = persistence.write(proto, DEFAULT_METADATA)
        self.assertFalse(write_result)
예제 #12
0
    def test_readSinglePersonByName(self):
        """Reading by full name returns only the matching stored person."""
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(
            person_id='1_GENERATE',
            full_name=FULL_NAME_1,
            birthdate=BIRTHDATE_1,
        )
        incoming.people.add(
            person_id='2_GENERATE',
            full_name=FULL_NAME_2,
            birthdate=BIRTHDATE_2,
        )

        # Act
        persistence.write(incoming, DEFAULT_METADATA)
        matches = county_dao.read_people(
            SessionFactory.for_schema_base(JailsBase),
            full_name=_format_full_name(FULL_NAME_1),
        )

        # Assert
        assert len(matches) == 1
        only_match = matches[0]
        assert only_match.full_name == _format_full_name(FULL_NAME_1)
        assert only_match.birthdate == BIRTHDATE_1_DATE
예제 #13
0
    def test_twoDifferentPeople_persistsNone(self):
        """When one of two people has gender 'X', nobody is stored.

        The write is expected to return a falsy value and the subsequent
        read to come back empty.
        """
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(person_id='1', full_name=FULL_NAME_1)
        incoming.people.add(
            person_id='2', full_name=FULL_NAME_2, gender='X')

        # Act
        write_result = persistence.write(incoming, DEFAULT_METADATA)
        self.assertFalse(write_result)
        read_session = SessionFactory.for_schema_base(JailsBase)
        people_read_back = county_dao.read_people(read_session)

        # Assert
        assert not people_read_back
예제 #14
0
    def test_threeDifferentPeople_persistsTwoBelowThreshold(self):
        """Of three ingested people, the two without the bad booking persist.

        The third person references a booking whose custody status is
        'NO EXIST'; only the other two are expected to be read back.
        """
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(person_id='1_GENERATE', full_name=FULL_NAME_2)
        incoming.people.add(person_id='2_GENERATE', full_name=FULL_NAME_3)
        incoming.people.add(
            person_id=EXTERNAL_PERSON_ID,
            full_name=FULL_NAME_1,
            booking_ids=[EXTERNAL_BOOKING_ID],
        )
        incoming.bookings.add(
            booking_id=EXTERNAL_BOOKING_ID,
            custody_status='NO EXIST',
        )

        # Act
        persistence.write(incoming, DEFAULT_METADATA)
        read_session = SessionFactory.for_schema_base(JailsBase)
        people_read_back = county_dao.read_people(read_session)

        # Assert
        assert len(people_read_back) == 2
        first_read, second_read = people_read_back[0], people_read_back[1]
        assert first_read.full_name == _format_full_name(FULL_NAME_3)
        assert second_read.full_name == _format_full_name(FULL_NAME_2)
예제 #15
0
def persist_to_database(
    region_code: str, session_start_time: datetime.datetime
) -> bool:
    """Persist every batched ingest info for a region to the database.

    Returns False when no ingest infos were fetched, when the merged proto
    contains no people, or when the failed-task check says to abort;
    otherwise returns the result of the persistence write.  After a truthy
    write result the region's batched ingest infos are deleted.
    """
    region = regions.get_region(region_code)
    overrides = region.get_scraper_enum_overrides()

    batch = _get_batch_ingest_info_list(region_code, session_start_time)
    logging.info("Received %s total ingest infos", len(batch))

    # Nothing fetched: report it and stop.
    if not batch:
        logging.error("No ingest infos received from Datastore")
        return False

    proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list(batch)

    if not proto.people:
        logging.error("Scrape session returned 0 people.")
        return False

    # Report each failed task before deciding whether to abort.
    for failed_datum in failed_tasks.values():
        logging.error(
            "Task with trace_id %s failed with error %s",
            failed_datum.trace_id,
            failed_datum.error,
        )

    failure_count = len(failed_tasks)
    if _should_abort(failure_count, len(proto.people)):
        logging.error(
            "Too many scraper tasks failed(%s), aborting write", failure_count
        )
        return False

    ingest_metadata = IngestMetadata(
        region=region_code,
        jurisdiction_id=region.jurisdiction_id,
        ingest_time=session_start_time,
        facility_id=region.facility_id,
        enum_overrides=overrides,
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )

    write_result = persistence.write(proto, ingest_metadata)
    if write_result:
        datastore_ingest_info.batch_delete_ingest_infos_for_region(region_code)
    return write_result
예제 #16
0
    def test_twoDifferentPeopleWithBooking_persistsNone(self):
        """One bad booking among two ingested people means nobody is stored.

        The second person references a booking whose custody status is
        'NO EXIST'; the write is expected to return a falsy value and the
        subsequent read to come back empty.
        """
        # Arrange
        incoming = IngestInfo()
        incoming.people.add(full_name=FULL_NAME_2)
        incoming.people.add(
            full_name=FULL_NAME_1,
            person_id=EXTERNAL_PERSON_ID,
            booking_ids=[EXTERNAL_BOOKING_ID],
        )
        incoming.bookings.add(
            booking_id=EXTERNAL_BOOKING_ID,
            custody_status='NO EXIST',
        )

        # Act
        write_result = persistence.write(incoming, DEFAULT_METADATA)
        self.assertFalse(write_result)
        read_session = SessionFactory.for_schema_base(JailsBase)
        people_read_back = county_dao.read_people(read_session)

        # Assert
        assert not people_read_back
    def _parse_and_persist_contents(self, args: IngestArgsType,
                                    contents: ContentsType):
        """
        Parses the given contents, converts the parse result to a proto and
        writes it through the persistence layer.

        Raises:
            DirectIngestError: with PARSE_ERROR when parsing yields a falsy
                result, or with PERSISTENCE_ERROR when the write returns a
                falsy value.
        """
        parsed_info = self._parse(args, contents)
        # TODO(1738): implement retry on fail.
        if not parsed_info:
            raise DirectIngestError(
                msg="No IngestInfo after parse.",
                error_type=DirectIngestErrorType.PARSE_ERROR,
            )

        logging.info(
            "Successfully parsed data for ingest run [%s]",
            self._job_tag(args),
        )

        proto = ingest_utils.convert_ingest_info_to_proto(parsed_info)

        logging.info(
            "Successfully converted ingest_info to proto for ingest "
            "run [%s]",
            self._job_tag(args),
        )

        metadata = IngestMetadata(
            self.region.region_code,
            self.region.jurisdiction_id,
            args.ingest_time,
            self.get_enum_overrides(),
            self.system_level,
        )
        if not persistence.write(proto, metadata):
            raise DirectIngestError(
                msg="Persist step failed",
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
            )

        logging.info(
            "Successfully persisted for ingest run [%s]",
            self._job_tag(args),
        )
예제 #18
0
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        the task carried by the request.

        The steps, in order: obtain content (from the task itself or by
        fetching the endpoint), optionally populate scraped data, optionally
        queue follow-up tasks, then persist ingest info and store single
        counts when the scraped data is marked for persistence.

        Args:
            request: the queued request; its `next_task` describes what to
                fetch/scrape, and its `ingest_info`, `scrape_type` and
                `scraper_start_time` are carried through to follow-up tasks
                and persistence.

        Raises:
            ScraperFetchError: when fetching content fails.
            ScraperPopulateDataError: when `populate_data` fails.
            ScraperGetMoreTasksError: when `get_more_tasks` fails.
            Exception: any other error is re-raised; when BATCH_WRITES is set
                the error is first recorded via `batch_persistence`.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # Note that we use get here for the post_data to return a
                # default value of None if this scraper doesn't set it.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    # NOTE(review): `cookies` is updated in place on every
                    # iteration, so cookies from earlier next_tasks are also
                    # present for later ones -- confirm this is intended.
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    # NOTE(review): the message says "at most 4" while the
                    # actual cap is constants.MAX_PEOPLE_TO_LOG -- confirm
                    # the two agree.
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    # The metadata is only consumed by the direct write in
                    # the non-batch branch below.
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    # A single count without a date takes the start date of
                    # the current BACKGROUND scrape session, if there is one.
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            # Record the failure for batch scrapes before propagating it.
            # NOTE(review): `task` is assigned on the first line of the try
            # body; if that assignment itself raised, `task` would be unbound
            # here.
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise e
    def test_state_threeSentenceGroups_dontPersistAboveThreshold(self):
        """The stored people are expected to be unchanged after the write.

        Two people are seeded, each owning a sentence group with external id
        SENTENCE_GROUP_ID_2.  One person with two sentence groups is then
        ingested, and the people read back are compared against the seeded
        state.
        """
        # Arrange
        incoming = IngestInfo()
        incoming.state_people.add(
            state_person_id='1_GENERATE',
            state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
        )
        incoming.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID,
            county_code=COUNTY_CODE,
        )
        incoming.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID_2,
            county_code=COUNTY_CODE,
        )

        # First stored person: owns both sentence groups.
        stored_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
        stored_group = schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
        )
        stored_group_2 = schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        )
        stored_external_id = schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
        stored_person.sentence_groups = [stored_group, stored_group_2]
        stored_person.external_ids = [stored_external_id]

        # Second stored person: owns a group with the same external id as
        # the first person's second group.
        other_person = schema.StatePerson(
            person_id=ID_2, full_name=FULL_NAME_1)
        other_group_2_dup = schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        )
        other_external_id = schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
        other_person.sentence_groups = [other_group_2_dup]
        other_person.external_ids = [other_external_id]

        # No updates
        unchanged_person = self.to_entity(stored_person)
        unchanged_other_person = self.to_entity(other_person)

        seed_session = SessionFactory.for_schema_base(StateBase)
        seed_session.add(stored_person)
        seed_session.add(other_person)
        seed_session.commit()

        # Act
        persistence.write(incoming, DEFAULT_METADATA)
        read_session = SessionFactory.for_schema_base(StateBase)
        people_read_back = dao.read_people(read_session)

        # Assert
        self.assertEqual(
            [unchanged_person, unchanged_other_person],
            converter.convert_schema_objects_to_entity(people_read_back),
        )
    def test_state_threeSentenceGroups_persistsTwoBelowThreshold(self):
        """Two of three ingested sentence groups are expected to be updated.

        Two people are seeded, each owning a sentence group with external id
        SENTENCE_GROUP_ID_3.  One person with three sentence groups is then
        ingested; the expected result carries the county code on the first
        two groups only.
        """
        # Arrange
        incoming = IngestInfo()
        incoming.state_people.add(
            state_person_id='1_GENERATE',
            state_sentence_group_ids=[
                SENTENCE_GROUP_ID,
                SENTENCE_GROUP_ID_2,
                SENTENCE_GROUP_ID_3,
            ],
        )
        incoming.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID,
            county_code=COUNTY_CODE,
        )
        incoming.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID_2,
            county_code=COUNTY_CODE,
        )
        incoming.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID_3,
            county_code=COUNTY_CODE,
        )

        # First stored person: owns all three sentence groups.
        stored_person = schema.StatePerson(person_id=ID, full_name=FULL_NAME_1)
        stored_group = schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
        )
        stored_group_2 = schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
        )
        stored_group_3 = schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
        )
        stored_external_id = schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
        stored_person.sentence_groups = [
            stored_group, stored_group_2, stored_group_3
        ]
        stored_person.external_ids = [stored_external_id]

        # Second stored person: owns a group with the same external id as
        # the first person's third group.
        other_person = schema.StatePerson(
            person_id=ID_2, full_name=FULL_NAME_1)
        other_group_3_dup = schema.StateSentenceGroup(
            sentence_group_id=ID_4,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
        )
        other_external_id = schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
        other_person.sentence_groups = [other_group_3_dup]
        other_person.external_ids = [other_external_id]

        # Expected entity graph for the first person.
        want_person = StatePerson.new_with_defaults(
            person_id=ID,
            full_name=FULL_NAME_1,
            external_ids=[],
            sentence_groups=[],
        )
        want_external_id = StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
            person=want_person,
        )
        want_group = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID,
            state_code=REGION_CODE,
            county_code=COUNTY_CODE,
            person=want_person,
        )
        want_group_2 = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=REGION_CODE,
            county_code=COUNTY_CODE,
            person=want_person,
        )
        # No county code because errors during match
        want_group_3 = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
            person=want_person,
        )
        want_person.external_ids = [want_external_id]
        want_person.sentence_groups = [
            want_group, want_group_2, want_group_3
        ]

        # Expected entity graph for the second person.
        want_other_person = StatePerson.new_with_defaults(
            person_id=ID_2, full_name=FULL_NAME_1)
        want_other_external_id = StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID_2,
            state_code=REGION_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
            person=want_other_person,
        )
        # No county code because unmatched
        want_other_group_3_dup = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_4,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=REGION_CODE,
            person=want_other_person,
        )
        want_other_person.sentence_groups = [want_other_group_3_dup]
        want_other_person.external_ids = [want_other_external_id]

        seed_session = SessionFactory.for_schema_base(StateBase)
        seed_session.add(stored_person)
        seed_session.add(other_person)
        seed_session.commit()

        # Act
        persistence.write(incoming, DEFAULT_METADATA)
        read_session = SessionFactory.for_schema_base(StateBase)
        people_read_back = dao.read_people(read_session)

        # Assert
        self.assertEqual(
            [want_person, want_other_person],
            converter.convert_schema_objects_to_entity(people_read_back),
        )
예제 #21
0
    def test_state_threeSentenceGroups_dontPersistAboveThreshold(
            self, mock_get_matcher):
        """Checks that nothing is updated when too many sentence groups error.

        The patched matcher is configured with two erroring sentence group
        external ids out of the four that are ingested. After the write, the
        people read back from the database are expected to equal the rows
        that were stored beforehand.
        """
        # Arrange: matcher stub and the ids it is configured to error on.
        mock_get_matcher.return_value = _PatchedStateEntityMatcher(
            region_code=STATE_CODE,
            erroring_class=schema.StateSentenceGroup,
            erroring_external_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_4],
        )

        # Arrange: two ingested people referencing four sentence groups.
        ingest_info = IngestInfo()
        ingest_info.state_people.add(
            state_person_id="1_GENERATE",
            state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
        )
        ingest_info.state_people.add(
            state_person_id="2_GENERATE",
            state_sentence_group_ids=[
                SENTENCE_GROUP_ID_3, SENTENCE_GROUP_ID_4
            ],
        )
        for ingested_group_id in (SENTENCE_GROUP_ID,
                                  SENTENCE_GROUP_ID_2,
                                  SENTENCE_GROUP_ID_3,
                                  SENTENCE_GROUP_ID_4):
            ingest_info.state_sentence_groups.add(
                state_sentence_group_id=ingested_group_id,
                county_code=COUNTY_CODE)

        # Arrange: rows that exist in the database before the write.
        unknown_status = StateSentenceStatus.EXTERNAL_UNKNOWN.value

        db_person = schema.StatePerson(
            person_id=ID, full_name=FULL_NAME_1, state_code=STATE_CODE)
        db_person.sentence_groups = [
            schema.StateSentenceGroup(
                sentence_group_id=ID,
                status=unknown_status,
                external_id=SENTENCE_GROUP_ID,
                state_code=STATE_CODE,
            ),
            schema.StateSentenceGroup(
                sentence_group_id=ID_2,
                status=unknown_status,
                external_id=SENTENCE_GROUP_ID_2,
                state_code=STATE_CODE,
            ),
        ]
        db_person.external_ids = [
            schema.StatePersonExternalId(
                person_external_id_id=ID,
                state_code=STATE_CODE,
                external_id=EXTERNAL_ID,
                id_type=ID_TYPE,
            ),
        ]

        db_person_2 = schema.StatePerson(
            person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE)
        db_person_2.external_ids = [
            schema.StatePersonExternalId(
                person_external_id_id=ID_2,
                state_code=STATE_CODE,
                external_id=EXTERNAL_ID_2,
                id_type=ID_TYPE,
            ),
        ]
        db_person_2.sentence_groups = [
            schema.StateSentenceGroup(
                sentence_group_id=ID_3,
                status=unknown_status,
                external_id=SENTENCE_GROUP_ID_3,
                state_code=STATE_CODE,
            ),
        ]

        # No updates are expected, so the expectation is simply the entity
        # form of the seeded rows (captured before they join a session).
        expected_people = [
            self.to_entity(db_person),
            self.to_entity(db_person_2),
        ]

        seed_session = SessionFactory.for_schema_base(StateBase)
        for db_row in (db_person, db_person_2):
            seed_session.add(db_row)
        seed_session.commit()

        # Act
        persistence.write(ingest_info, DEFAULT_METADATA)
        read_session = SessionFactory.for_schema_base(StateBase)
        people_in_db = dao.read_people(read_session)

        # Assert
        self.assertEqual(
            expected_people,
            converter.convert_schema_objects_to_entity(people_in_db),
        )
예제 #22
0
    def test_readPersonAndAllRelationships(self):
        """Writes one person with related records and reads everything back.

        The ingested person has a single booking with an arrest and two
        charges; both charges point at the same bond id and each has its own
        sentence. The test then reads people through county_dao and checks
        fields on the person, booking, arrest, charges, bonds and sentences.
        """
        # Arrange
        metadata = IngestMetadata.new_with_defaults(
            region=REGION_1,
            jurisdiction_id=JURISDICTION_ID,
            ingest_time=SCRAPER_START_DATETIME)

        # Ids used to link the ingested records to one another.
        booking_ref = 'BOOKING_ID'
        arrest_ref = 'ARREST_ID'
        bond_ref = 'SHARED_BOND_ID'
        charge_refs = ['CHARGE_ID_1', 'CHARGE_ID_2']
        sentence_refs = ['SENTENCE_ID_1', 'SENTENCE_ID_2']

        ingest_info = IngestInfo()
        ingest_info.people.add(full_name=FULL_NAME_1,
                               booking_ids=[booking_ref])
        ingest_info.bookings.add(booking_id=booking_ref,
                                 facility=FACILITY,
                                 custody_status=BOOKING_CUSTODY_STATUS,
                                 arrest_id=arrest_ref,
                                 charge_ids=list(charge_refs))
        ingest_info.arrests.add(arrest_id=arrest_ref,
                                officer_name=OFFICER_NAME)

        ingest_info.bonds.add(bond_id=bond_ref,
                              bond_type=BOND_TYPE,
                              status=BOND_STATUS)

        ingested_charges = [
            Charge(charge_id=charge_ref,
                   name=charge_name,
                   status=CHARGE_STATUS,
                   bond_id=bond_ref,
                   sentence_id=sentence_ref)
            for charge_ref, charge_name, sentence_ref in zip(
                charge_refs, [CHARGE_NAME_1, CHARGE_NAME_2], sentence_refs)
        ]
        ingest_info.charges.extend(ingested_charges)

        ingested_sentences = [
            Sentence(sentence_id=sentence_ref,
                     fine_dollars=fine,
                     status=SENTENCE_STATUS)
            for sentence_ref, fine in zip(sentence_refs, [FINE_1, FINE_2])
        ]
        ingest_info.sentences.extend(ingested_sentences)

        # Act
        persistence.write(ingest_info, metadata)
        read_session = SessionFactory.for_schema_base(JailsBase)
        people = county_dao.read_people(read_session)

        # Assert
        assert len(people) == 1
        person = people[0]
        assert person.full_name == _format_full_name(FULL_NAME_1)

        assert len(person.bookings) == 1
        booking = person.bookings[0]
        assert booking.facility == FACILITY
        assert booking.last_seen_time == SCRAPER_START_DATETIME

        arrest = booking.arrest
        assert arrest.officer_name == OFFICER_NAME

        charges = booking.charges
        assert len(charges) == 2
        first_charge, second_charge = charges[0], charges[1]
        assert first_charge.name == CHARGE_NAME_1
        assert second_charge.name == CHARGE_NAME_2

        # Both charges were ingested with the same bond id.
        first_bond, second_bond = first_charge.bond, second_charge.bond
        assert first_bond.bond_type == second_bond.bond_type

        first_sentence = first_charge.sentence
        second_sentence = second_charge.sentence
        assert first_sentence.fine_dollars == FINE_1_INT
        assert second_sentence.fine_dollars == FINE_2_INT
예제 #23
0
    def test_state_threeSentenceGroups_persistsTwoBelowThreshold(
            self, mock_get_matcher):
        """Ensure that the number of errors is below the ND specific threshold.

        The patched matcher is configured with one erroring sentence group
        external id out of the three that are ingested. The test expects the
        other two sentence groups to come back with the ingested county code,
        while the erroring one keeps its previously stored values.
        """
        # Arrange
        mock_get_matcher.return_value = _PatchedStateEntityMatcher(
            region_code=STATE_CODE,
            erroring_class=schema.StateSentenceGroup,
            erroring_external_ids=[SENTENCE_GROUP_ID],
        )

        # The threshold mapping is module-level state that this test mutates.
        # Remember what was there and restore it once the test finishes, so
        # the override cannot leak into tests that run afterwards.
        threshold_was_set = (
            ENTITY_MATCHING_THRESHOLD
            in STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS)
        previous_threshold = (
            STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS[
                ENTITY_MATCHING_THRESHOLD]
            if threshold_was_set else None)

        def _restore_entity_matching_threshold():
            # Put the mapping back exactly as it was before the override.
            if threshold_was_set:
                STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS[
                    ENTITY_MATCHING_THRESHOLD] = previous_threshold
            else:
                STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS.pop(
                    ENTITY_MATCHING_THRESHOLD, None)

        self.addCleanup(_restore_entity_matching_threshold)

        # Set the ENTITY_MATCHING_THRESHOLD to 0, such that we can verify that
        # the forty percent threshold for ENTITY_MATCHING_THRESHOLD is
        # dictated by the state-specific override in
        # STATE_CODE_TO_ENTITY_MATCHING_THRESHOLD_FORTY_PERCENT.
        STATE_ERROR_THRESHOLDS_WITH_FORTY_PERCENT_RATIOS[
            ENTITY_MATCHING_THRESHOLD] = 0

        # Two ingested people referencing three sentence groups in total.
        ingest_info = IngestInfo()
        ingest_info.state_people.add(
            state_person_id="1_GENERATE",
            state_sentence_group_ids=[SENTENCE_GROUP_ID, SENTENCE_GROUP_ID_2],
        )
        ingest_info.state_people.add(
            state_person_id="2_GENERATE",
            state_sentence_group_ids=[SENTENCE_GROUP_ID_3])
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID, county_code=COUNTY_CODE)
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID_2,
            county_code=COUNTY_CODE)
        ingest_info.state_sentence_groups.add(
            state_sentence_group_id=SENTENCE_GROUP_ID_3,
            county_code=COUNTY_CODE)

        # Rows that exist in the database before the write (no county codes).
        db_person = schema.StatePerson(person_id=ID,
                                       full_name=FULL_NAME_1,
                                       state_code=STATE_CODE)
        db_sentence_group = schema.StateSentenceGroup(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID,
            state_code=STATE_CODE,
        )
        db_sentence_group_2 = schema.StateSentenceGroup(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=STATE_CODE,
        )
        db_external_id = schema.StatePersonExternalId(
            person_external_id_id=ID,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
        )
        db_person.sentence_groups = [db_sentence_group, db_sentence_group_2]
        db_person.external_ids = [db_external_id]

        db_person_2 = schema.StatePerson(person_id=ID_2,
                                         full_name=FULL_NAME_1,
                                         state_code=STATE_CODE)
        db_sentence_group_3 = schema.StateSentenceGroup(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN.value,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=STATE_CODE,
        )
        db_external_id_2 = schema.StatePersonExternalId(
            person_external_id_id=ID_2,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
        )
        db_person_2.external_ids = [db_external_id_2]
        db_person_2.sentence_groups = [db_sentence_group_3]

        session = SessionFactory.for_schema_base(StateBase)
        session.add(db_person)
        session.add(db_person_2)
        session.commit()

        # Expected entity graph after the write.
        expected_person = StatePerson.new_with_defaults(
            person_id=ID,
            full_name=FULL_NAME_1,
            external_ids=[],
            sentence_groups=[],
            state_code=STATE_CODE,
        )
        expected_external_id = StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID,
            id_type=ID_TYPE,
            person=expected_person,
        )
        # No county code because errors during match
        expected_sentence_group = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID,
            state_code=STATE_CODE,
            person=expected_person,
        )
        expected_sentence_group_2 = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_2,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_2,
            state_code=STATE_CODE,
            county_code=COUNTY_CODE,
            person=expected_person,
        )
        expected_person.external_ids = [expected_external_id]
        expected_person.sentence_groups = [
            expected_sentence_group,
            expected_sentence_group_2,
        ]

        expected_person_2 = StatePerson.new_with_defaults(
            person_id=ID_2, full_name=FULL_NAME_1, state_code=STATE_CODE)
        expected_external_id_2 = StatePersonExternalId.new_with_defaults(
            person_external_id_id=ID_2,
            state_code=STATE_CODE,
            external_id=EXTERNAL_ID_2,
            id_type=ID_TYPE,
            person=expected_person_2,
        )
        expected_sentence_group_3 = StateSentenceGroup.new_with_defaults(
            sentence_group_id=ID_3,
            status=StateSentenceStatus.EXTERNAL_UNKNOWN,
            external_id=SENTENCE_GROUP_ID_3,
            state_code=STATE_CODE,
            county_code=COUNTY_CODE,
            person=expected_person_2,
        )
        expected_person_2.sentence_groups = [expected_sentence_group_3]
        expected_person_2.external_ids = [expected_external_id_2]

        # Act
        persistence.write(ingest_info, DEFAULT_METADATA)
        session = SessionFactory.for_schema_base(StateBase)
        persons = dao.read_people(session)

        # Assert
        self.assertEqual(
            [expected_person, expected_person_2],
            converter.convert_schema_objects_to_entity(persons),
        )