def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                             mock_session_return):
        """Persists two regions one after the other, each with its own session.

        The first region is written and persisted, after which it must have no
        ingest infos left. The second region is then written under a fresh
        mock session, must show exactly one stored ingest info, and is
        persisted in turn. Each persist call must hand `mock_write` the proto
        of that region's ingest info, for two writes in total.
        """
        background = constants.ScrapeType.BACKGROUND
        first_key = ScrapeKey(REGIONS[0], background)
        second_key = ScrapeKey(REGIONS[1], background)

        # Same person/booking ids in both regions; only the name differs.
        first_info = ingest_info.IngestInfo()
        first_person = first_info.create_person(
            person_id=TEST_ID, full_name=TEST_NAME)
        first_person.create_booking(booking_id=TEST_ID)

        second_info = ingest_info.IngestInfo()
        second_person = second_info.create_person(
            person_id=TEST_ID, full_name=TEST_NAME2)
        second_person.create_booking(booking_id=TEST_ID)

        # Two separate but identically configured tasks.
        task_fields = dict(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        first_task = Task(**task_fields)
        second_task = Task(**task_fields)

        # --- First region ---
        first_session = create_mock_session()
        mock_session_return.return_value = first_session

        batch_persistence.write(first_info, first_key, first_task)
        first_expected = serialization.convert_ingest_info_to_proto(first_info)
        batch_persistence.persist_to_database(first_key.region_code,
                                              first_session.start)

        first_written = mock_write.call_args[0][0]
        self.assertEqual(first_written, first_expected)

        # Persisting a region should leave no ingest infos behind for it.
        remaining_first = (
            datastore_ingest_info.batch_get_ingest_infos_for_region(
                REGIONS[0], first_session.start))
        self.assertEqual(len(remaining_first), 0)

        # --- Second region ---
        second_session = create_mock_session()
        mock_session_return.return_value = second_session

        batch_persistence.write(second_info, second_key, second_task)
        stored_second = (
            datastore_ingest_info.batch_get_ingest_infos_for_region(
                REGIONS[1], second_session.start))
        self.assertEqual(len(stored_second), 1)

        second_expected = serialization.convert_ingest_info_to_proto(
            second_info)
        batch_persistence.persist_to_database(second_key.region_code,
                                              second_session.start)

        second_written = mock_write.call_args[0][0]
        self.assertEqual(second_written, second_expected)

        self.assertEqual(mock_write.call_count, 2)
    def test_persist_to_db(self, mock_write, _mock_region,
                           mock_session_return):
        """Writes one ingest info, persists it, and checks it was consumed.

        The proto handed to `mock_write` must equal the proto built from the
        written ingest info, and the region must have no ingest infos left
        once the persist call has run.
        """
        session = create_mock_session()
        mock_session_return.return_value = session
        key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        info = ingest_info.IngestInfo()
        person = info.create_person(person_id=TEST_ID, full_name=TEST_NAME)
        person.create_booking(booking_id=TEST_ID)

        task = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        batch_persistence.write(info, key, task)
        wanted_proto = serialization.convert_ingest_info_to_proto(info)
        batch_persistence.persist_to_database(key.region_code, session.start)

        written_proto = mock_write.call_args[0][0]
        self.assertEqual(written_proto, wanted_proto)

        # Nothing should remain for this region once it has been persisted.
        leftover = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], session.start)
        self.assertEqual(len(leftover), 0)
    def validate_ingest(
        self,
        ingest_info: IngestInfo,
        expected_ingest_info: IngestInfo,
        metadata: IngestMetadata,
    ) -> IngestInfo:
        """Checks a computed ingest_info against the expected one.

        When an expected object is given, the computed object is converted to
        a proto, the proto is validated, the proto is converted to persistence
        entities (which must report zero enum, general and protected-class
        errors), the resulting people are validated, and finally the computed
        and expected objects are diffed. Any difference fails the test with a
        message listing both objects and the differences.

        Args:
            ingest_info: the computed ingest info object.
            expected_ingest_info: the ingest info the computed object should
                match. If it is `None`, the computed object is only asserted
                to be equal to it and none of the other checks run.
            metadata: an ingest info metadata struct to pass along to the
                proto-to-entities converter.

        Returns:
            The `ingest_info` argument that was passed in, in case the caller
            needs to do any extra validations on it.
        """
        if expected_ingest_info is None:
            assert ingest_info == expected_ingest_info
            return ingest_info

        # Full round trip: python object -> proto -> validated proto ->
        # persistence entities (this last step parses strings into types).
        proto = serialization.convert_ingest_info_to_proto(ingest_info)
        validate(proto)
        conversion = ingest_info_converter.convert_to_persistence_entities(
            proto, metadata)

        assert conversion.enum_parsing_errors == 0
        assert conversion.general_parsing_errors == 0
        assert conversion.protected_class_errors == 0

        entity_validator.validate(conversion.people)

        mismatches = diff_ingest_infos(expected_ingest_info, ingest_info)
        if not mismatches:
            return ingest_info

        failure_template = (
            "IngestInfo objects do not match.\n"
            "Expected:\n{}\n"
            "Actual:\n{}\n"
            "Differences:\n{}\n\n"
            "(paste the following) scraped object:"
            "\n{}"
        )
        self.fail(  # type: ignore[attr-defined]
            failure_template.format(
                expected_ingest_info,
                ingest_info,
                "\n".join(mismatches),
                repr(ingest_info),
            ))

        return ingest_info
示例#4
0
    def test_scrape_data_and_more_yes_persist(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
    ) -> None:
        """Runs a SCRAPE_DATA_AND_MORE task whose scraped data asks to persist.

        With batch writes turned off on the scraper, the test expects exactly
        one call to `mock_write` carrying the converted proto and the expected
        metadata, and expects the task returned by `mock_get_more` to be
        queued as a follow-up request.
        """
        background = constants.ScrapeType.BACKGROUND
        started_at = datetime.datetime.now()

        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(ingest_info=self.ii, persist=True)

        request = QueueRequest(
            scrape_type=background,
            next_task=Task.evolve(
                TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE
            ),
            scraper_start_time=started_at,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(request)

        # The scraped data was marked persist=True, so the ingest info should
        # be written once, and the follow-up task should still be queued.
        region = scraper.region
        wanted_metadata = IngestMetadata(
            region=region.region_code,
            jurisdiction_id=region.jurisdiction_id,
            ingest_time=started_at,
            enum_overrides=scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )
        wanted_proto = convert_ingest_info_to_proto(self.ii)
        wanted_requests = [
            QueueRequest(
                scrape_type=background,
                next_task=TEST_TASK,
                scraper_start_time=started_at,
            )
        ]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(wanted_proto, wanted_metadata)
        self.assertCountEqual(wanted_requests, scraper.tasks)
    def test_convert_ingest_info_one_charge_to_one_bond(
        self, mock_create: Mock
    ) -> None:
        """Two charges, each owning its own bond, convert to two bond protos.

        Neither bond has an id, so each one is expected to receive a generated
        id ("1_GENERATE", then "2_GENERATE") that its charge proto references.
        The proto is also converted back and must equal the original object.
        """
        mock_create.side_effect = self._create_generated_id

        # Arrange the python ingest info.
        info = ingest_info.IngestInfo()
        person = info.create_person()
        person.person_id = "id1"
        booking = person.create_booking()
        booking.booking_id = "id1"
        for charge_id in ("id1", "id2"):
            charge = booking.create_charge()
            charge.charge_id = charge_id
            charge.create_bond().amount = "$1"

        # Arrange the proto we expect the conversion to produce.
        expected_proto = ingest_info_pb2.IngestInfo()
        person_pb = expected_proto.people.add()
        person_pb.person_id = "id1"
        person_pb.booking_ids.append("id1")
        booking_pb = expected_proto.bookings.add()
        booking_pb.booking_id = "id1"
        booking_pb.charge_ids.extend(["id1", "id2"])
        charge_to_bond = (("id1", "1_GENERATE"), ("id2", "2_GENERATE"))
        for charge_id, generated_bond_id in charge_to_bond:
            charge_pb = expected_proto.charges.add()
            charge_pb.charge_id = charge_id
            bond_pb = expected_proto.bonds.add()
            bond_pb.amount = "$1"
            bond_pb.bond_id = generated_bond_id
            charge_pb.bond_id = bond_pb.bond_id

        # Act / assert: forward conversion, then the round trip back.
        proto = serialization.convert_ingest_info_to_proto(info)
        assert expected_proto == proto

        round_tripped = serialization.convert_proto_to_ingest_info(proto)
        assert round_tripped == info
    def test_convert_ingest_info_id_is_generated(self, mock_create: Mock) -> None:
        """A person and booking created without ids get generated ids.

        The expected proto carries "1_GENERATE" for the person and
        "2_GENERATE" for the booking. Converting the proto back must give an
        object equal to the original ingest info.
        """
        mock_create.side_effect = self._create_generated_id

        # Neither the person nor the booking is given an explicit id.
        info = ingest_info.IngestInfo()
        person = info.create_person()
        person.surname = "testname"
        person.create_booking()

        expected_proto = ingest_info_pb2.IngestInfo()
        person_pb = expected_proto.people.add()
        person_pb.person_id = "1_GENERATE"
        person_pb.surname = "testname"
        booking_pb = expected_proto.bookings.add()
        booking_pb.booking_id = "2_GENERATE"
        person_pb.booking_ids.append(booking_pb.booking_id)

        proto = serialization.convert_ingest_info_to_proto(info)
        assert proto == expected_proto

        round_tripped = serialization.convert_proto_to_ingest_info(proto)
        assert round_tripped == info
    def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                       _mock_region,
                                                       mock_session_return):
        """An error recorded for a task that also succeeded counts as a pass.

        One ingest info and one error are written for two identically
        configured tasks. The persist call is expected to return a truthy
        value, to write the proto of the successful ingest info, and to leave
        no ingest infos behind for the region.
        """
        session = create_mock_session()
        mock_session_return.return_value = session
        key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        mock_write.return_value = True

        info = ingest_info.IngestInfo()
        person = info.create_person(person_id=TEST_ID, full_name=TEST_NAME)
        person.create_booking(booking_id=TEST_ID)

        # The passing and the failing task are built from the same fields;
        # because the tasks are the same, the pair should count as a pass.
        task_fields = dict(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        passing_task = Task(**task_fields)
        failing_task = Task(**task_fields)

        batch_persistence.write(info, key, passing_task)
        batch_persistence.write_error(TEST_ERROR, TEST_TRACE, failing_task, key)

        wanted_proto = serialization.convert_ingest_info_to_proto(info)

        persisted = batch_persistence.persist_to_database(
            key.region_code, session.start)
        self.assertTrue(persisted)

        written_proto = mock_write.call_args[0][0]
        self.assertEqual(written_proto, wanted_proto)

        leftover = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], session.start)
        self.assertEqual(len(leftover), 0)
    def test_convert_ingest_info_id_is_not_generated(self) -> None:
        """Explicitly provided ids are carried into the proto as-is.

        The person and booking are given ids "id1" and "id2", and the expected
        proto uses those same ids. Converting the proto back must give an
        object equal to the original ingest info.
        """
        # Arrange the python ingest info with explicit ids.
        info = ingest_info.IngestInfo()
        person = info.create_person()
        person.surname = "testname"
        person.person_id = "id1"
        booking = person.create_booking()
        booking.admission_date = "testdate"
        booking.booking_id = "id2"

        # Arrange the proto we expect the conversion to produce.
        expected_proto = ingest_info_pb2.IngestInfo()
        person_pb = expected_proto.people.add()
        person_pb.surname = "testname"
        person_pb.person_id = "id1"
        person_pb.booking_ids.append("id2")
        booking_pb = expected_proto.bookings.add()
        booking_pb.admission_date = "testdate"
        booking_pb.booking_id = "id2"

        proto = serialization.convert_ingest_info_to_proto(info)
        assert expected_proto == proto

        round_tripped = serialization.convert_proto_to_ingest_info(proto)
        assert round_tripped == info
示例#9
0
def _get_proto_from_batch_ingest_info_data_list(
    batch_ingest_info_data_list: List[BatchIngestInfoData],
) -> Tuple[ingest_info_pb2.IngestInfo, Dict[int, BatchIngestInfoData]]:
    """Merges the batched ingest infos into a single validated proto.

    Walks the batch in order. For each task hash, the first entry without an
    error marks that task as successful and contributes its ingest info (if
    it has one); later entries for an already-successful task are ignored.
    The kept ingest infos are deduplicated, converted to a proto, and
    validated.

    Args:
        batch_ingest_info_data_list: A list of BatchIngestInfoData.
    Returns:
        A tuple of:
          - an IngestInfo proto with data from all of the kept ingest infos.
          - a dict from task hash to the last errored entry seen for every
            task that has no successful entry in the batch.
    """
    logging.info("Starting generation of proto")
    kept_infos: List[IngestInfo] = []
    passed_hashes: Set[int] = set()
    failures_by_hash: Dict[int, BatchIngestInfoData] = {}

    for datum in batch_ingest_info_data_list:
        # Entries are keyed by their precomputed task hash so that "has this
        # task been seen?" is a set/dict lookup rather than a scan over all
        # earlier entries, which would be quadratic on large batches.
        task_hash = datum.task_hash

        if datum.error or task_hash in passed_hashes:
            # Only record a failure while no successful run has been seen: a
            # task may fail a few times before passing, and those earlier
            # failures should not count against it.
            if task_hash not in passed_hashes:
                failures_by_hash[task_hash] = datum
            continue

        # First error-free entry for this task: it supersedes any failure
        # recorded for the same task earlier in the batch.
        passed_hashes.add(task_hash)
        failures_by_hash.pop(task_hash, None)
        if datum.ingest_info:
            kept_infos.append(datum.ingest_info)

    merged_info = _dedup_people(kept_infos)
    merged_proto = serialization.convert_ingest_info_to_proto(merged_info)
    ingest_info_validator.validate(merged_proto)
    logging.info("Generated proto for [%s] people", len(merged_proto.people))
    return merged_proto, failures_by_hash
    def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                      mock_session_return):
        """Tests that duplicate ingest_info.Person objects are merged before
        write.

        Three ingest infos are written under three distinct tasks: one with a
        first person (and booking), one with a second, different person, and
        a deep copy of the first. The proto handed to `mock_write` is expected
        to contain the first and second person exactly once each.
        """
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        # Arrange
        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        # The second person belongs to the second ingest info. Creating it on
        # `ii` (as this test previously did) left `ii_2` empty, so no distinct
        # second ingest info was ever written and `ii_2.people` contributed
        # nothing to the expectation below.
        ii_2 = ingest_info.IngestInfo()
        ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

        # An exact duplicate of the first ingest info, written under its own
        # task so that it is not discarded as a repeat of the same task.
        ii_1_dup = copy.deepcopy(ii)

        # Distinct endpoints give each write a distinct task.
        t1, t2, t3 = (Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT + str(i),
            response_type=constants.ResponseType.TEXT,
        ) for i in range(3))

        # Act
        batch_persistence.write(ii, scrape_key, t1)
        batch_persistence.write(ii_2, scrape_key, t2)
        batch_persistence.write(ii_1_dup, scrape_key, t3)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        # Assert
        # NOTE(review): this expectation assumes the batches are merged in
        # the order they were written (first person, then second) - confirm
        # against the dedup/merge implementation.
        expected_ii = ingest_info.IngestInfo(people=ii.people + ii_2.people)
        expected_proto = serialization.convert_ingest_info_to_proto(
            expected_ii)
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)
示例#11
0
    def _parse_and_persist_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> None:
        """
        Runs the full ingest process for this controller for files with
        non-empty contents: parse, convert to proto, then persist.

        Args:
            args: the ingest args for this run; passed to the parse step and
                used to build the job tag and the ingest metadata.
            contents_handle: handle to the file contents, passed to the parse
                step.

        Raises:
            DirectIngestError: with PARSE_ERROR when parsing yields a falsy
                ingest info, or with PERSISTENCE_ERROR when the persistence
                write returns a falsy result.
        """
        parsed_info = self._parse(args, contents_handle)
        if not parsed_info:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="No IngestInfo after parse.",
            )
        logging.info("Successfully parsed data for ingest run [%s]",
                     self._job_tag(args))

        proto = serialization.convert_ingest_info_to_proto(parsed_info)
        logging.info(
            "Successfully converted ingest_info to proto for ingest run [%s]",
            self._job_tag(args),
        )

        metadata = self._get_ingest_metadata(args)
        if not persistence.write(proto, metadata):
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
                msg="Persist step failed",
            )
        logging.info("Successfully persisted for ingest run [%s]",
                     self._job_tag(args))
示例#12
0
    def test_convert_ingest_info_state_entities(self) -> None:
        # Arrange Python ingest info
        info = ingest_info.IngestInfo()
        person = info.create_state_person()
        person.state_person_id = "person1"
        person.surname = "testname"

        race = person.create_state_person_race()
        race.state_person_race_id = "race1"
        race.race = "white"
        ethnicity = person.create_state_person_ethnicity()
        ethnicity.state_person_ethnicity_id = "ethnicity1"
        ethnicity.ethnicity = "non-hispanic"
        external_id = person.create_state_person_external_id()
        external_id.state_person_external_id_id = "external_id1"
        external_id.id_type = "contrived"
        alias = person.create_state_alias()
        alias.state_alias_id = "alias1"
        alias.surname = "testerson"
        assessment = person.create_state_assessment()
        assessment.state_assessment_id = "assessment1"
        assessment.assessment_score = "42"
        supervising_officer = person.create_state_agent()
        supervising_officer.state_agent_id = "supervising_officer1"
        supervising_officer.full_name = "Officer Supervising"

        assessment_agent = assessment.create_state_agent()
        assessment_agent.state_agent_id = "agent1"
        assessment_agent.full_name = "Officer Jones"

        program_assignment = person.create_state_program_assignment()
        program_assignment.state_program_assignment_id = "assignment1"
        program_assignment.program_id = "program_id1"

        program_assignment_agent = program_assignment.create_state_agent()
        program_assignment_agent.state_agent_id = "program_agent1"
        program_assignment_agent.full_name = "Officer Program"

        group = person.create_state_sentence_group()
        group.state_sentence_group_id = "group1"

        fine = group.create_state_fine()
        fine.state_fine_id = "fine1"

        incarceration_sentence = group.create_state_incarceration_sentence()
        incarceration_sentence.state_incarceration_sentence_id = "is1"
        early_discharge1 = incarceration_sentence.create_state_early_discharge()
        early_discharge1.state_early_discharge_id = "early_discharge1"
        charge1 = incarceration_sentence.create_state_charge()
        charge1.state_charge_id = "charge1"
        charge1.classification_type = "F"
        incarceration_period = (
            incarceration_sentence.create_state_incarceration_period()
        )
        incarceration_period.state_incarceration_period_id = "ip1"
        incarceration_period.status = "IN_CUSTODY"
        incarceration_period.specialized_purpose_for_incarceration = (
            "SHOCK INCARCERATION"
        )
        incarceration_period.state_program_assignments = [program_assignment]
        incident = incarceration_period.create_state_incarceration_incident()
        incident.state_incarceration_incident_id = "incident1"
        incident.incident_type = "FISTICUFFS"
        incident_outcome = incident.create_state_incarceration_incident_outcome()
        incident_outcome.state_incarceration_incident_outcome_id = "incident1-1"
        incident_outcome.outcome_type = "FINE"

        incident_agent = incident.create_state_agent()
        incident_agent.state_agent_id = "agent2"
        incident_agent.full_name = "Officer Thompson"

        decision = incarceration_period.create_state_parole_decision()
        decision.state_parole_decision_id = "decision1"

        decision_agent = decision.create_state_agent()
        decision_agent.state_agent_id = "agent3"
        decision_agent.full_name = "Officer Barkley"

        supervision_sentence = group.create_state_supervision_sentence()
        supervision_sentence.state_supervision_sentence_id = "ss1"
        early_discharge2 = supervision_sentence.create_state_early_discharge()
        early_discharge2.state_early_discharge_id = "early_discharge2"
        charge2 = supervision_sentence.create_state_charge()
        charge2.state_charge_id = "charge2"
        charge2.classification_type = "M"
        supervision_period = supervision_sentence.create_state_supervision_period()
        supervision_period.state_supervision_period_id = "sp1"
        supervision_period.status = "TERMINATED"
        supervision_period_agent = supervision_period.create_state_agent()
        supervision_period_agent.state_agent_id = "agentPO"
        supervision_period_agent.full_name = "Officer Paroley"
        supervision_period.state_program_assignments = [program_assignment]

        supervision_case_type_entry = (
            supervision_period.create_state_supervision_case_type_entry()
        )
        supervision_case_type_entry.case_type = "case_type"
        supervision_case_type_entry.state_supervision_case_type_entry_id = (
            "case_type_entry_id"
        )

        supervision_contact = supervision_period.create_state_supervision_contact()
        supervision_contact.state_supervision_contact_id = "supervision_contact_id"
        supervision_contact.contact_type = "contact_type"
        supervision_contacted_agent = supervision_contact.create_state_agent()
        supervision_contacted_agent.state_agent_id = "agentPO"
        supervision_contacted_agent.full_name = "Officer Paroley"

        violation = supervision_period.create_state_supervision_violation()
        violation.state_supervision_violation_id = "violation1"
        violation.violated_conditions = "cond"
        violation.is_violent = "false"

        violation_type = violation.create_state_supervision_violation_type_entry()
        violation_type.state_supervision_violation_type_entry_id = "violation_type_id"
        violation_type.violation_type = "FELONY"

        violated_condition = (
            violation.create_state_supervision_violated_condition_entry()
        )
        violated_condition.state_supervision_violated_condition_entry_id = (
            "condition_id"
        )
        violated_condition.condition = "CURFEW"

        response = violation.create_state_supervision_violation_response()
        response.state_supervision_violation_response_id = "response1"
        response_decision_agent = response.create_state_agent()
        response_decision_agent.state_agent_id = "agentTERM"
        response_decision_agent.full_name = "Officer Termy"

        response_decision = (
            response.create_state_supervision_violation_response_decision_entry()
        )
        response_decision.state_supervision_violation_response_decision_entry_id = (
            "response_decision_id"
        )
        response_decision.decision = "REVOCATION"
        response_decision.revocation_type = "REINCARCERATION"

        bond = charge1.create_state_bond()
        bond.state_bond_id = "bond1"

        court_case = charge2.create_state_court_case()
        court_case.state_court_case_id = "case1"

        court_case_agent = court_case.create_state_agent()
        court_case_agent.state_agent_id = "agentJ"
        court_case_agent.full_name = "Judge Agent"

        # Arrange Proto ingest info
        expected_proto = ingest_info_pb2.IngestInfo()
        person_pb = expected_proto.state_people.add()
        person_pb.state_person_id = "person1"
        person_pb.surname = "testname"

        person_pb.state_person_race_ids.append("race1")
        race_pb = expected_proto.state_person_races.add()
        race_pb.state_person_race_id = "race1"
        race_pb.race = "white"
        person_pb.state_person_ethnicity_ids.append("ethnicity1")
        ethnicity_pb = expected_proto.state_person_ethnicities.add()
        ethnicity_pb.state_person_ethnicity_id = "ethnicity1"
        ethnicity_pb.ethnicity = "non-hispanic"
        person_pb.state_person_external_ids_ids.append("contrived:external_id1")
        external_id_pb = expected_proto.state_person_external_ids.add()
        external_id_pb.state_person_external_id_id = "contrived:external_id1"
        external_id_pb.id_type = "contrived"
        person_pb.state_alias_ids.append("alias1")
        alias_pb = expected_proto.state_aliases.add()
        alias_pb.state_alias_id = "alias1"
        alias_pb.surname = "testerson"
        person_pb.state_assessment_ids.append("assessment1")
        assessment_pb = expected_proto.state_assessments.add()
        assessment_pb.state_assessment_id = "assessment1"
        assessment_pb.assessment_score = "42"
        person_pb.supervising_officer_id = "supervising_officer1"
        supervising_officer_pb = expected_proto.state_agents.add()
        supervising_officer_pb.state_agent_id = "supervising_officer1"
        supervising_officer_pb.full_name = "Officer Supervising"

        assessment_pb.conducting_agent_id = "agent1"
        assessment_agent_pb = expected_proto.state_agents.add()
        assessment_agent_pb.state_agent_id = "agent1"
        assessment_agent_pb.full_name = "Officer Jones"

        person_pb.state_program_assignment_ids.append("assignment1")
        program_assignment_pb = expected_proto.state_program_assignments.add()
        program_assignment_pb.state_program_assignment_id = "assignment1"
        program_assignment_pb.program_id = "program_id1"
        program_assignment_pb.referring_agent_id = "program_agent1"
        program_assignment_agent_pb = expected_proto.state_agents.add()
        program_assignment_agent_pb.state_agent_id = "program_agent1"
        program_assignment_agent_pb.full_name = "Officer Program"

        person_pb.state_sentence_group_ids.append("group1")
        group_pb = expected_proto.state_sentence_groups.add()
        group_pb.state_sentence_group_id = "group1"

        group_pb.state_fine_ids.append("fine1")
        fine_pb = expected_proto.state_fines.add()
        fine_pb.state_fine_id = "fine1"

        group_pb.state_supervision_sentence_ids.append("ss1")
        supervision_sentence_pb = expected_proto.state_supervision_sentences.add()
        supervision_sentence_pb.state_supervision_sentence_id = "ss1"
        supervision_sentence_pb.state_early_discharge_ids.append("early_discharge2")
        early_discharge2_pb = expected_proto.state_early_discharges.add()
        early_discharge2_pb.state_early_discharge_id = "early_discharge2"
        supervision_sentence_pb.state_charge_ids.append("charge2")
        charge2_pb = expected_proto.state_charges.add()
        charge2_pb.state_charge_id = "charge2"
        charge2_pb.classification_type = "M"
        supervision_sentence_pb.state_supervision_period_ids.append("sp1")
        supervision_period_pb = expected_proto.state_supervision_periods.add()
        supervision_period_pb.state_supervision_period_id = "sp1"
        supervision_period_pb.status = "TERMINATED"
        supervision_period_pb.state_program_assignment_ids.append("assignment1")

        # An ordering requirement in the proto equality check at the end of this
        # test requires that this agent be added after agent1 and before agentPO
        court_case_agent_pb = expected_proto.state_agents.add()
        court_case_agent_pb.state_agent_id = "agentJ"
        court_case_agent_pb.full_name = "Judge Agent"

        supervision_period_pb.supervising_officer_id = "agentPO"
        supervision_period_agent_pb = expected_proto.state_agents.add()
        supervision_period_agent_pb.state_agent_id = "agentPO"
        supervision_period_agent_pb.full_name = "Officer Paroley"

        supervision_case_type_entry_pb = (
            expected_proto.state_supervision_case_type_entries.add()
        )
        supervision_case_type_entry_pb.state_supervision_case_type_entry_id = (
            "case_type_entry_id"
        )
        supervision_case_type_entry_pb.case_type = "case_type"
        supervision_period_pb.state_supervision_case_type_entry_ids.append(
            "case_type_entry_id"
        )

        supervision_contact_pb = expected_proto.state_supervision_contacts.add()
        supervision_contact_pb.state_supervision_contact_id = "supervision_contact_id"
        supervision_contact_pb.contact_type = "contact_type"
        supervision_contact_pb.contacted_agent_id = "agentPO"
        supervision_period_pb.state_supervision_contact_ids.append(
            "supervision_contact_id"
        )

        supervision_period_pb.state_supervision_violation_entry_ids.append("violation1")
        violation_pb = expected_proto.state_supervision_violations.add()
        violation_pb.state_supervision_violation_id = "violation1"
        violation_pb.is_violent = "false"
        violation_pb.violated_conditions = "cond"
        violation_pb.state_supervision_violation_type_entry_ids.append(
            "violation_type_id"
        )
        violation_type_pb = (
            expected_proto.state_supervision_violation_type_entries.add()
        )
        violation_type_pb.state_supervision_violation_type_entry_id = (
            "violation_type_id"
        )
        violation_type_pb.violation_type = "FELONY"

        violation_pb.state_supervision_violated_condition_entry_ids.append(
            "condition_id"
        )
        proto_violation_type_pb = (
            expected_proto.state_supervision_violated_condition_entries.add()
        )
        proto_violation_type_pb.state_supervision_violated_condition_entry_id = (
            "condition_id"
        )
        proto_violation_type_pb.condition = "CURFEW"

        violation_pb.state_supervision_violation_response_ids.append("response1")
        response_pb = expected_proto.state_supervision_violation_responses.add()
        response_pb.state_supervision_violation_response_id = "response1"
        response_pb.decision_agent_ids.append("agentTERM")
        response_decision_agent_pb = expected_proto.state_agents.add()
        response_decision_agent_pb.state_agent_id = "agentTERM"
        response_decision_agent_pb.full_name = "Officer Termy"
        response_decision_pb = (
            expected_proto.state_supervision_violation_response_decision_entries.add()
        )
        response_decision_pb.state_supervision_violation_response_decision_entry_id = (
            "response_decision_id"
        )
        response_decision_pb.decision = "REVOCATION"
        response_decision_pb.revocation_type = "REINCARCERATION"
        response_pb.state_supervision_violation_response_decision_entry_ids.append(
            "response_decision_id"
        )

        group_pb.state_incarceration_sentence_ids.append("is1")
        incarceration_sentence_pb = expected_proto.state_incarceration_sentences.add()
        incarceration_sentence_pb.state_incarceration_sentence_id = "is1"
        incarceration_sentence_pb.state_early_discharge_ids.append("early_discharge1")
        early_discharge1_pb = expected_proto.state_early_discharges.add()
        early_discharge1_pb.state_early_discharge_id = "early_discharge1"
        incarceration_sentence_pb.state_charge_ids.append("charge1")
        charge1_pb = expected_proto.state_charges.add()
        charge1_pb.state_charge_id = "charge1"
        charge1_pb.classification_type = "F"
        incarceration_sentence_pb.state_incarceration_period_ids.append("ip1")
        incarceration_period_pb = expected_proto.state_incarceration_periods.add()
        incarceration_period_pb.state_incarceration_period_id = "ip1"
        incarceration_period_pb.status = "IN_CUSTODY"
        incarceration_period_pb.specialized_purpose_for_incarceration = (
            "SHOCK INCARCERATION"
        )
        incarceration_period_pb.state_incarceration_incident_ids.append("incident1")
        incident_pb = expected_proto.state_incarceration_incidents.add()
        incident_pb.state_incarceration_incident_id = "incident1"
        incident_pb.incident_type = "FISTICUFFS"
        incarceration_period_pb.state_program_assignment_ids.append("assignment1")

        incident_pb.responding_officer_id = "agent2"
        incident_agent_pb = expected_proto.state_agents.add()
        incident_agent_pb.state_agent_id = "agent2"
        incident_agent_pb.full_name = "Officer Thompson"

        incident_pb.state_incarceration_incident_outcome_ids.append("incident1-1")
        incident_outcome_pb = expected_proto.state_incarceration_incident_outcomes.add()
        incident_outcome_pb.state_incarceration_incident_outcome_id = "incident1-1"
        incident_outcome_pb.outcome_type = "FINE"

        incarceration_period_pb.state_parole_decision_ids.append("decision1")
        decision_pb = expected_proto.state_parole_decisions.add()
        decision_pb.state_parole_decision_id = "decision1"

        decision_pb.decision_agent_ids.append("agent3")
        decision_agent_pb = expected_proto.state_agents.add()
        decision_agent_pb.state_agent_id = "agent3"
        decision_agent_pb.full_name = "Officer Barkley"

        charge1_pb.state_bond_id = "bond1"
        bond_pb = expected_proto.state_bonds.add()
        bond_pb.state_bond_id = "bond1"

        charge2_pb.state_court_case_id = "case1"
        court_case_pb = expected_proto.state_court_cases.add()
        court_case_pb.state_court_case_id = "case1"

        court_case_pb.judge_id = "agentJ"

        expected_info = copy.deepcopy(info)
        # Act & Assert

        proto = serialization.convert_ingest_info_to_proto(info)
        assert expected_proto == proto

        info_back = serialization.convert_proto_to_ingest_info(proto)
        assert info_back == expected_info

        # Assert that none of the proto's collections are empty, i.e. we've
        # tested all of the object graph
        proto_classes = [field.name for field in proto.DESCRIPTOR.fields]
        for cls in proto_classes:
            if cls.startswith("state_"):
                assert proto.__getattribute__(cls)
Example #13
0
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        the task carried by the request.

        Depending on the task, it obtains content (from the task itself or
        by fetching the endpoint), populates ingest data, enqueues follow-up
        tasks, and then persists or batch-writes the result together with
        any single counts.

        Args:
            request: the QueueRequest for this step; supplies `next_task`
                (the task to run), `scrape_type`, `scraper_start_time` and
                any `ingest_info` carried over from an earlier task.

        Raises:
            ScraperFetchError: if fetching the endpoint content fails.
            ScraperPopulateDataError: if `populate_data` fails.
            ScraperGetMoreTasksError: if `get_more_tasks` fails.
            Exception: anything else raised along the way is re-raised
                unchanged.  When `BATCH_WRITES` is set, the error is first
                recorded through `batch_persistence.write_error`.
        """
        # Read the task before entering the `try` so that the error handler
        # below can always refer to it.
        task = request.next_task

        try:
            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # Any failure here is wrapped so callers can tell fetch
                # errors apart from parsing errors.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json,
                    )
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info(
                    "Scraping data for [%s] and endpoint: [%s]",
                    self.region.region_code,
                    task.endpoint,
                )
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info(
                    "Getting more tasks for [%s] and endpoint: [%s]",
                    self.region.region_code,
                    task.endpoint,
                )

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any.
                    # Merge into a fresh dict for every task so that one
                    # task's cookies never leak into the tasks that follow;
                    # the task's own cookies win over the response's.
                    if cookies:
                        merged_cookies = dict(cookies)
                        merged_cookies.update(next_task.cookies)
                        next_task = Task.evolve(
                            next_task, cookies=merged_cookies)
                    self.add_task(
                        "_generic_scrape",
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ),
                    )

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    people = scraped_data.ingest_info.people
                    logging.info(
                        "Logging at most %d people (were %d):",
                        constants.MAX_PEOPLE_TO_LOG,
                        len(people),
                    )
                    for person in people[:constants.MAX_PEOPLE_TO_LOG]:
                        logging.info("[%s]", str(person))
                    logging.info(
                        "Last seen time of person being set as: [%s]",
                        request.scraper_start_time,
                    )
                    # Built for both paths, but only consumed by the direct
                    # (non-batched) write below.
                    metadata = IngestMetadata(
                        region=self.region.region_code,
                        jurisdiction_id=self.region.jurisdiction_id,
                        ingest_time=request.scraper_start_time,
                        enum_overrides=self.get_enum_overrides(),
                        system_level=SystemLevel.COUNTY,
                        database_key=SQLAlchemyDatabaseKey.for_schema(
                            SchemaType.JAILS),
                    )
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(people),
                            self.region.region_code,
                        )
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]",
                            len(people),
                            self.region.region_code,
                        )
                        persistence.write(
                            serialization.convert_ingest_info_to_proto(
                                scraped_data.ingest_info),
                            metadata,
                        )
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        # Undated counts take the start date of the current
                        # session, when there is one.
                        # NOTE(review): the session is looked up under the
                        # BACKGROUND scrape type regardless of
                        # request.scrape_type -- confirm this is intended.
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                # Record the failure for the batch before propagating it.
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            # Bare `raise` re-raises the active exception as-is.
            raise