def test_good_table(self):
        """Extracts good_table.html with good_table.yaml.

        The expected result is a single person whose only populated field is
        the birthdate.
        """
        want = IngestInfo()
        want_person = want.create_person()
        want_person.birthdate = "1/15/2048"

        got = self.extract("good_table.html", "good_table.yaml")
        self.assertEqual(want, got)
    def testParse(self):
        """Parses _ROSTER_JSON with UsMaMiddlesexParser and validates the result.

        The expected IngestInfo holds two people: the first has one booking
        with two charges (the second charge carries a bond) and a hold; the
        second has one booking with an arrest and a single charge.
        """
        region = regions.get_region('us_ma_middlesex', is_direct_ingest=True)
        ingestor = region.get_ingestor()

        metadata = IngestMetadata(
            region.region_code,
            region.jurisdiction_id,
            _FAKE_START_TIME,
            ingestor.get_enum_overrides(),
        )

        parsed_info = UsMaMiddlesexParser().parse(_ROSTER_JSON)

        wanted_info = IngestInfo()

        # First person: one booking, two charges, one bond and one hold.
        first_person = wanted_info.create_person(
            person_id='12345       ',
            birthdate='1111-01-01 00:00:00.000',
            gender='M',
            ethnicity='HISPANIC',
            place_of_residence='123 ST DORCHESTER MA 01234     ',
        )
        first_booking = first_person.create_booking(
            booking_id='1.0',
            admission_date='2017-01-01 00:00:00.000',
            admission_reason='BAIL MITTIMUS',
            facility='MAIN      ',
        )
        first_booking.create_charge(
            charge_id='1245.0',
            statute='90/24/K',
            name='OUI-LIQUOR, 2ND OFFENSE c90 ss24',
            case_number='111.0',
            court_type='Middlesex SC (81)',
            charge_notes='Other',
        )
        dismissed_charge = first_booking.create_charge(
            charge_id='1502.0',
            offense_date='2017-01-28 00:00:00',
            statute='90/23/J',
            name='OUI while license suspended for OUI',
            case_number='222.0',
            court_type='Middlesex SC (81)',
            charge_notes='Drug or Alcohol',
            status='DISMISSED',
        )
        dismissed_charge.create_bond(bond_id='12345.0')
        first_booking.create_hold(
            hold_id='00000.0',
            jurisdiction_name='Middlesex SC (81)',
        )

        # Second person: one booking with an arrest and a single charge.
        second_person = wanted_info.create_person(
            person_id='10472       ',
            birthdate='1111-02-02 00:00:00.000',
            gender='M',
            race='BLACK or AFRICAN AMERICAN',
            place_of_residence='456 ST MALDEN MA 98765      ',
        )
        second_booking = second_person.create_booking(
            booking_id='333.0',
            admission_date='2018-02-02 00:00:00.000',
            admission_reason='SENTENCE MITTIMUS',
            facility='MAIN      ',
        )
        second_booking.create_arrest(agency='Cambridge PD')
        second_booking.create_charge(
            charge_id='12341234.0',
            statute='269/10/J',
            name='FIREARM, CARRY WITHOUT LICENSE c269 ss10',
            case_number='555.0',
            charge_notes='Other',
            court_type='Cambridge DC (52)',
        )

        self.validate_ingest(parsed_info, wanted_info, metadata)
 def test_child_first(self):
     """Checks that multi_key mappings (table columns) create any parent
     objects that are needed: each charge ends up under its own booking."""
     want = IngestInfo()
     person = want.create_person()
     # One booking per (admission date, charge name) pair, in column order.
     for admitted, charge_name in (('111', 'AAA'), ('222', 'BBB')):
         booking = person.create_booking(admission_date=admitted)
         booking.create_charge(name=charge_name)
     got = self.extract('child_first.html', 'child_first.yaml')
     self.assertEqual(want, got)
Пример #4
0
    def test_bond_multi_key(self) -> None:
        """Extracts bonds.html with bonds.yaml and expects three charges under
        a single booking, each charge carrying its own bond."""
        want = IngestInfo()
        only_booking = want.create_person().create_booking()
        for bond_id, amount in (("1", "10"), ("2", "20"), ("3", "30")):
            only_booking.create_charge().create_bond(bond_id=bond_id, amount=amount)

        got = self.extract("bonds.html", "bonds.yaml")
        self.assertEqual(want, got)
Пример #5
0
 def test_child_first(self) -> None:
     """Checks that multi_key mappings (table columns) create any parent
     objects that are needed: each charge ends up under its own booking."""
     want = IngestInfo()
     person = want.create_person()
     # One booking per (admission date, charge name) pair, in column order.
     for admitted, charge_name in (("111", "AAA"), ("222", "BBB")):
         booking = person.create_booking(admission_date=admitted)
         booking.create_charge(name=charge_name)
     got = self.extract("child_first.html", "child_first.yaml")
     self.assertEqual(want, got)
Пример #6
0
    def test_th_rows(self) -> None:
        """Checks extraction when the yaml keys appear as <th> cells in rows.

        The expected result is one person with race and gender set.
        """
        want = IngestInfo()
        want_person = want.create_person()
        want_person.race = "WHITE"
        want_person.gender = "M"

        got = self.extract("th_rows.html", "th_rows.yaml")
        self.assertEqual(want, got)
    def test_bond_multi_key(self):
        """Extracts bonds.html with bonds.yaml and expects three charges under
        a single booking, each charge carrying its own bond."""
        want = IngestInfo()
        only_booking = want.create_person().create_booking()
        for bond_id, amount in (('1', '10'), ('2', '20'), ('3', '30')):
            only_booking.create_charge().create_bond(bond_id=bond_id,
                                                     amount=amount)

        got = self.extract('bonds.html', 'bonds.yaml')
        self.assertEqual(want, got)
    def test_th_rows(self):
        """Checks extraction when the yaml keys appear as <th> cells in rows.

        The expected result is one person with race and gender set.
        """
        want = IngestInfo()
        want_person = want.create_person()
        want_person.race = 'WHITE'
        want_person.gender = 'M'

        got = self.extract('th_rows.html', 'th_rows.yaml')
        self.assertEqual(want, got)
Пример #9
0
    def test_readPeopleWithOpenBookings(self):
        """Checks which stored people dao.read_people_with_open_bookings returns.

        Three people are stored: one with an open booking and no full name,
        one with an open booking and _FULL_NAME, and one with _FULL_NAME but
        only a released booking. Only the second is expected back when the
        lookup is driven by an ingested person named _FULL_NAME.
        """
        admitted = datetime.datetime(2018, 6, 20)
        released = datetime.date(2018, 7, 20)

        booking_open = Booking(
            custody_status=CustodyStatus.IN_CUSTODY.value,
            admission_date=admitted,
            first_seen_time=admitted,
            last_seen_time=admitted,
        )
        booking_closed = Booking(
            custody_status=CustodyStatus.RELEASED.value,
            admission_date=admitted,
            release_date=released,
            first_seen_time=admitted,
            last_seen_time=admitted,
        )

        # Each person with an open booking gets an independent copy of it.
        unnamed_person = Person(
            person_id=1,
            region=_REGION,
            jurisdiction_id=_JURISDICTION_ID,
            bookings=[deepcopy(booking_open)],
        )
        named_person = Person(
            person_id=2,
            region=_REGION,
            jurisdiction_id=_JURISDICTION_ID,
            bookings=[deepcopy(booking_open)],
            full_name=_FULL_NAME,
        )
        released_person = Person(
            person_id=6,
            region=_REGION,
            jurisdiction_id=_JURISDICTION_ID,
            full_name=_FULL_NAME,
            bookings=[booking_closed],
        )

        with SessionFactory.using_database(self.database_key,
                                           autocommit=False) as session:
            for stored in (unnamed_person, released_person, named_person):
                session.add(stored)
            session.commit()

            ingested = IngestInfo()
            ingested.create_person(full_name=_FULL_NAME, person_id=_EXTERNAL_ID)
            found = dao.read_people_with_open_bookings(
                session, _REGION, ingested.people)

            wanted = [converter.convert_schema_object_to_entity(named_person)]
            self.assertCountEqual(found, wanted)
    def test_one_to_many(self):
        """Checks that the single 'Sentence Length' cell fills both the
        min_length and max_length of the expected sentence."""
        yaml_path = os.path.join(
            os.path.dirname(__file__),
            "../testdata/data_extractor/yaml/one_to_many.yaml",
        )
        extractor = HtmlDataExtractor(yaml_path)

        want = IngestInfo()
        want_charge = want.create_person().create_booking().create_charge()
        want_charge.create_sentence(min_length="1 day", max_length="1 day")

        page = html.fromstring("<td>Sentence Length</td><td>1 day</td>")
        got = extractor.extract_and_populate_data(page)
        self.assertEqual(want, got)
 def test_single_page_roster(self):
     """Checks that bookings are not handled as multi-key classes: when
     people are listed in columns, each person is assumed to have at most
     one booking."""
     want = IngestInfo()
     roster = (
         ("PERSON ONE", "1/1/1111", "NUMBER ONE"),
         ("PERSON TWO", "2/2/2222", "NUMBER TWO"),
         ("PERSON THREE", "3/3/3333", "NUMBER THREE"),
     )
     for full_name, birthdate, booking_id in roster:
         listed = want.create_person(full_name=full_name, birthdate=birthdate)
         listed.create_booking(booking_id=booking_id)
     got = self.extract("single_page_roster.html", "single_page_roster.yaml")
     self.assertEqual(want, got)
    def test_no_multi_key_parent(self):
        """Checks parent creation when the scraped field's parent is a
        multi-key class that was itself never scraped. Here charges are the
        multi-key class, while the bond comes from a booking that has no
        charge information."""
        want = IngestInfo()
        parent_charge = want.create_person().create_booking().create_charge()
        parent_charge.create_bond(bond_id='1111')

        # Expect a warning from the extractor that 'Charge Description' is
        # missing. The key must still sit under multi_key_mappings, otherwise
        # charge would not be treated as a multi_key class.
        got = self.extract('no_charges.html', 'charge_multi_key.yaml')
        self.assertEqual(want, got)
    def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                      mock_session_return):
        """Checks that duplicated ingest_info.Person objects are merged
        before the write happens."""
        mock_session = create_mock_session()
        mock_session_return.return_value = mock_session
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        # Arrange
        ii = IngestInfo()
        first_person = ii.create_person(person_id=TEST_ID, full_name=TEST_NAME)
        first_person.create_booking(booking_id=TEST_ID)

        ii_2 = IngestInfo()
        # NOTE(review): the second person is added to `ii`, so `ii_2` stays
        # empty. This looks like a typo for `ii_2.create_person(...)` --
        # confirm the intent before changing it, since the expected proto
        # below depends on the resulting order of people.
        ii.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

        ii_1_dup = copy.deepcopy(ii)

        tasks = [
            Task(task_type=constants.TaskType.SCRAPE_DATA,
                 endpoint=TEST_ENDPOINT + str(i),
                 response_type=constants.ResponseType.TEXT)
            for i in range(3)
        ]
        t1, t2, t3 = tasks

        # Act: write the original, the empty one and the duplicate, then
        # persist everything for the region.
        batch_persistence.write(ii, scrape_key, t1)
        batch_persistence.write(ii_2, scrape_key, t2)
        batch_persistence.write(ii_1_dup, scrape_key, t3)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        # Assert: the proto handed to the mocked write holds each person once.
        merged_people = ii.people + ii_2.people
        expected_proto = ingest_utils.convert_ingest_info_to_proto(
            IngestInfo(people=merged_people))
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)
Пример #14
0
    def test_jailtracker_person(self):
        """Extracts _JT_PERSON with the jailtracker_person.yaml mapping and
        expects a single person with id, birthdate, age and race set."""
        yaml_path = os.path.join(os.path.dirname(__file__),
                                 'fixtures/jailtracker_person.yaml')
        extractor = JsonDataExtractor(yaml_path)

        want = IngestInfo()
        want.create_person(person_id='012345',
                           birthdate='12/12/0001',
                           age='2018',
                           race='WHITE')
        got = extractor.extract_and_populate_data(_JT_PERSON)

        self.assertEqual(got, want)
Пример #15
0
    def test_jailtracker_person(self) -> None:
        """Extracts the jailtracker_person.json fixture with the
        jailtracker_person.yaml mapping and expects a single person with id,
        birthdate, age and race set."""
        yaml_path = os.path.join(
            os.path.dirname(__file__), "fixtures/jailtracker_person.yaml"
        )
        extractor = JsonDataExtractor(yaml_path)

        want = IngestInfo()
        want.create_person(
            person_id="012345", birthdate="12/12/0001", age="2018", race="WHITE"
        )
        person_json = fixtures.as_dict("extractor", "jailtracker_person.json")
        got = extractor.extract_and_populate_data(person_json)

        self.assertEqual(got, want)
Пример #16
0
    def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                             mock_session_return):
        """Writes and persists one ingest info for each of two regions.

        Region REGIONS[0] is written and persisted first, and is then expected
        to have no ingest infos left. Region REGIONS[1] is written afterwards,
        is expected to hold exactly one ingest info before it is persisted,
        and the mocked write is expected to have been called twice overall.
        """
        scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        # Two ingest infos sharing TEST_ID but with different full names.
        ii = IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        ii2 = IngestInfo()
        ii2.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

        # Both tasks are built from the same arguments.
        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # Session mock used for the first region.
        mock_session_1 = mock_session_return.return_value = create_mock_session(
        )

        batch_persistence.write(ii, scrape_key1, t)
        expected_proto = ingest_utils.convert_ingest_info_to_proto(ii)
        batch_persistence.persist_to_database(scrape_key1.region_code,
                                              mock_session_1.start)

        # First positional argument of the most recent mocked write call.
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # We expect the region that we persisted to have no more ingest infos.
        ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session_1.start)
        self.assertEqual(len(ingest_infos_1), 0)

        # A fresh session mock replaces the first one for the second region.
        mock_session_2 = mock_session_return.return_value = create_mock_session(
        )

        batch_persistence.write(ii2, scrape_key2, t2)
        ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[1], mock_session_2.start)
        self.assertEqual(len(ingest_infos_2), 1)

        expected_proto = ingest_utils.convert_ingest_info_to_proto(ii2)
        batch_persistence.persist_to_database(scrape_key2.region_code,
                                              mock_session_2.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # One mocked write per persisted region.
        self.assertEqual(mock_write.call_count, 2)
    def test_content_is_not_modified(self):
        """Checks that HtmlDataExtractor leaves |content| unmutated: after
        extraction the parsed page must still contain no <td> elements."""
        yaml_path = os.path.join(
            os.path.dirname(__file__),
            "../testdata/data_extractor/yaml/text_label.yaml",
        )
        extractor = HtmlDataExtractor(yaml_path)

        want = IngestInfo()
        want_person = want.create_person()
        want_person.birthdate = "1/1/1111"

        page = html.fromstring("<html><div>DOB: 1/1/1111</div></html>")
        got = extractor.extract_and_populate_data(page)

        self.assertEqual(want, got)
        td_cells = page.cssselect("td")
        self.assertFalse(td_cells)
    def testParseColFail(self):
        """Expects running the parse test on 'MDC_VERA_20200303_02' to raise a
        DirectIngestError complaining about extra columns in a charge row."""
        arrest = Arrest(agency='/BSO', )
        fail_to_comply = Charge(
            offense_date='02/27/2020',
            name='FAIL TO COMPLY',
            case_number='D202CR201802134',
        )
        aggravated_dwi = Charge(
            offense_date='02/27/2020',
            name='AGGRAVATED DWI-3',
            case_number='D202CR201802134',
        )
        booking = Booking(
            booking_id='130877687',
            admission_date='02/27/2020 14:51',
            custody_status='IN CUSTODY',
            facility='BERNALILLO COUNTY METRO DETENTION CENTER',
            arrest=arrest,
            charges=[fail_to_comply, aggravated_dwi],
        )
        person = Person(
            person_id='100041685',
            gender='M',
            age='41',
            race='AMERICAN INDIAN',
            bookings=[booking],
        )
        expected_info = IngestInfo(people=[person])

        with pytest.raises(DirectIngestError) as raised:
            self.run_parse_file_test(expected_info, 'MDC_VERA_20200303_02')
        message = str(raised.value)
        assert message == "Found more columns than expected in charge row"
Пример #19
0
def write_ingest_info(
    region: str, task_hash: int, session_start_time: datetime, ingest_info: IngestInfo
) -> BatchIngestInfoData:
    """Stores a new ingest info entity in Datastore for a region.

    Args:
        region: (string) The region the ingest info belongs to
        task_hash: (int) the hash of the task associated with the ingest info
        session_start_time: (datetime) The start time of the scraper that
          produced the ingest info
        ingest_info: (IngestInfo) The ingest info data

    Returns:
        The BatchIngestInfoData derived from the entity that was written.

    Raises:
        DatastoreWriteIngestInfoError: if the put issued through retry_grpc
          raises any exception; the original exception is chained as cause.
    """
    people_count = len(ingest_info.get_all_people())
    logging.info(
        "Writing a new ingest info (with %d people) for region: [%s]",
        people_count,
        region,
    )

    datastore_record = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        ingest_info=ingest_info,
        task_hash=task_hash,
    )
    entity = datastore_record.to_entity()

    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, entity)
    except Exception as write_error:
        raise DatastoreWriteIngestInfoError(ingest_info, region) from write_error

    return _DatastoreIngestInfo.get_batch_ingest_info_data(entity)
Пример #20
0
    def test_partial_table(self) -> None:
        """Checks a page that mixes a table with unstructured data.

        The expected result is one person with a booking, a charge and a bond.
        """
        want = IngestInfo()

        want_person = want.create_person()
        want_person.age = "38"
        want_person.place_of_residence = "WICHITA FALLS"
        want_person.race = "HISPANIC"

        want_booking = want_person.create_booking()
        want_booking.admission_date = "08/18/2017"

        want_charge = want_booking.create_charge()
        want_charge.name = "FIRST CHARGE"
        want_charge.charging_entity = "WICHITA FALLS PD"

        want_bond = want_charge.create_bond()
        want_bond.amount = "25,000.00"

        got = self.extract("partial_table.html", "partial_table.yaml")
        self.assertEqual(want, got)
    def test_three_levels_multi_key(self):
        """Extracts three_levels_multi_key.html and expects four bookings on
        one person, each with a single charge that carries a bond."""
        want = IngestInfo()
        person = want.create_person()

        # Only the first booking has a release date and a bond agent.
        first_booking = person.create_booking(admission_date='01/01/2011',
                                              release_date='02/02/2012')
        first_charge = first_booking.create_charge(name='Charge1')
        first_charge.create_bond(amount='$1.00', bond_agent='AGENT 1')

        # The remaining bookings share one admission date.
        later_rows = (
            ('Charge2', '$2.00'),
            ('Charge3', '$3.00'),
            ('Charge4', '$4.00'),
        )
        for charge_name, bond_amount in later_rows:
            booking = person.create_booking(admission_date='03/03/2013')
            booking.create_charge(name=charge_name).create_bond(
                amount=bond_amount)

        got = self.extract('three_levels_multi_key.html',
                           'three_levels_multi_key.yaml')
        self.assertEqual(want, got)
 def test_single_page_roster(self):
     """Checks that bookings are not handled as multi-key classes: when
     people are listed in columns, each person is assumed to have at most
     one booking."""
     want = IngestInfo()
     roster = (
         ('PERSON ONE', '1/1/1111', 'NUMBER ONE'),
         ('PERSON TWO', '2/2/2222', 'NUMBER TWO'),
         ('PERSON THREE', '3/3/3333', 'NUMBER THREE'),
     )
     for full_name, birthdate, booking_id in roster:
         listed = want.create_person(full_name=full_name,
                                     birthdate=birthdate)
         listed.create_booking(booking_id=booking_id)
     got = self.extract('single_page_roster.html',
                        'single_page_roster.yaml')
     self.assertEqual(want, got)
Пример #23
0
    def test_three_levels_multi_key(self) -> None:
        """Extracts three_levels_multi_key.html and expects four bookings on
        one person, each with a single charge that carries a bond."""
        want = IngestInfo()
        person = want.create_person()

        # Only the first booking has a release date and a bond agent.
        first_booking = person.create_booking(admission_date="01/01/2011",
                                              release_date="02/02/2012")
        first_charge = first_booking.create_charge(name="Charge1")
        first_charge.create_bond(amount="$1.00", bond_agent="AGENT 1")

        # The remaining bookings share one admission date.
        later_rows = (
            ("Charge2", "$2.00"),
            ("Charge3", "$3.00"),
            ("Charge4", "$4.00"),
        )
        for charge_name, bond_amount in later_rows:
            booking = person.create_booking(admission_date="03/03/2013")
            booking.create_charge(name=charge_name).create_bond(
                amount=bond_amount)

        got = self.extract("three_levels_multi_key.html",
                           "three_levels_multi_key.yaml")
        self.assertEqual(want, got)
    def test_partial_table(self):
        """Checks a page that mixes a table with unstructured data.

        The expected result is one person with a booking, a charge and a bond.
        """
        want = IngestInfo()

        want_person = want.create_person()
        want_person.age = '38'
        want_person.place_of_residence = 'WICHITA FALLS'
        want_person.race = 'HISPANIC'

        want_booking = want_person.create_booking()
        want_booking.admission_date = '08/18/2017'

        want_charge = want_booking.create_charge()
        want_charge.name = 'FIRST CHARGE'
        want_charge.charging_entity = 'WICHITA FALLS PD'

        want_bond = want_charge.create_bond()
        want_bond.amount = '25,000.00'

        got = self.extract('partial_table.html', 'partial_table.yaml')
        self.assertEqual(want, got)
Пример #25
0
 def test_labeled_fields(self) -> None:
     """Checks a page whose field values sit in <span>s that are labeled by
     <label> elements."""
     want = IngestInfo()

     want_person = want.create_person()
     want_person.person_id = "11111"
     want_person.race = "White"
     want_person.gender = "Male"

     want_booking = want_person.create_booking()
     want_booking.admission_date = "11/12/2018 5:04 PM"
     want_booking.facility = "Walla Walla County Corrections Department"

     want_charge = want_booking.create_charge()
     want_charge.name = "DUI"
     want_charge.offense_date = "9/21/2018 5:34 PM"
     want_charge.charge_class = "Gross Misdemeanor"
     want_charge.status = "Time Served"
     # NOTE(review): the charge returned by create_charge() is appended to the
     # booking again here; presumably the fixture yields two identical
     # charges -- confirm before removing.
     want_booking.charges.append(want_charge)

     got = self.extract("labeled_fields.html", "labeled_fields.yaml")
     self.assertEqual(want, got)
 def test_labeled_fields(self):
     """Checks a page whose field values sit in <span>s that are labeled by
     <label> elements."""
     want = IngestInfo()

     want_person = want.create_person()
     want_person.person_id = '11111'
     want_person.race = 'White'
     want_person.gender = 'Male'

     want_booking = want_person.create_booking()
     want_booking.admission_date = '11/12/2018 5:04 PM'
     want_booking.facility = 'Walla Walla County Corrections Department'

     want_charge = want_booking.create_charge()
     want_charge.name = 'DUI'
     want_charge.offense_date = '9/21/2018 5:34 PM'
     want_charge.charge_class = 'Gross Misdemeanor'
     want_charge.status = 'Time Served'
     # NOTE(review): the charge returned by create_charge() is appended to the
     # booking again here; presumably the fixture yields two identical
     # charges -- confirm before removing.
     want_booking.charges.append(want_charge)

     got = self.extract('labeled_fields.html', 'labeled_fields.yaml')
     self.assertEqual(want, got)
Пример #27
0
    def test_jailtracker_booking(self):
        """Extracts _JT_BOOKING with the jailtracker_booking.yaml mapping and
        expects one person holding two bookings."""
        yaml_path = os.path.join(os.path.dirname(__file__),
                                 'fixtures/jailtracker_booking.yaml')
        extractor = JsonDataExtractor(yaml_path)

        want = IngestInfo()
        want_person = want.create_person()
        # In each booking the admission and release dates are the same value.
        for booking_id, day in (('123098', '1/1/2001'),
                                ('123099', '1/1/2002')):
            want_person.create_booking(booking_id=booking_id,
                                       admission_date=day,
                                       release_date=day)

        got = extractor.extract_and_populate_data(_JT_BOOKING)

        self.assertEqual(got, want)
    def test_sort(self):
        """Checks that two IngestInfos differing only in booking order compare
        unequal before sort() and equal after both have been sorted."""
        earlier = ingest_info.Booking(admission_date='1')
        later = ingest_info.Booking(admission_date='2')

        ascending = IngestInfo(
            people=[ingest_info.Person(bookings=[earlier, later])])
        descending = IngestInfo(
            people=[ingest_info.Person(bookings=[later, earlier])])

        self.assertNotEqual(ascending, descending)
        ascending.sort()
        descending.sort()
        self.assertEqual(ascending, descending)
Пример #29
0
    def test_person_with_charges(self):
        """Extracts _PERSON_WITH_CHARGES and expects one person with two
        bookings: the first with two charges, the second with one."""
        yaml_path = os.path.join(os.path.dirname(__file__),
                                 'fixtures/person_with_charges.yaml')
        extractor = JsonDataExtractor(yaml_path)

        want = IngestInfo()
        want_person = want.create_person(person_id='3245',
                                         full_name='AAA AAAB',
                                         race='BLACK')

        first_booking = want_person.create_booking(booking_id='324567',
                                                   admission_date='1/1/1111')
        first_booking.create_charge(charge_id='345309', name='charge name 1')
        first_booking.create_charge(charge_id='894303', name='charge name 2')

        second_booking = want_person.create_booking(booking_id='3245',
                                                    admission_date='2/2/2222')
        second_booking.create_charge(charge_id='42309', name='charge name 3')

        got = extractor.extract_and_populate_data(_PERSON_WITH_CHARGES)
        self.assertEqual(got, want)
Пример #30
0
    def extract_and_populate_data(self,
                                  content: Union[Dict, List],
                                  ingest_info: Union[IngestInfo, None] = None):
        """Populates an ingest data model from parsed JSON content.

        Walks |content| via self._extract, filling |ingest_info| according to
        the user's yaml key mappings, and returns the pruned result.

        Args:
            content: An already parsed JSON object or array
            ingest_info: An IngestInfo object to populate; if None, a new
                one is created by default

        Returns:
            The result of calling prune() on the populated ingest data model.
        """
        # The None default is spelled as an explicit Union because PEP 484
        # no longer treats `X = None` as an implicit Optional; Union is used
        # as it is the typing name this signature already relies on.
        if ingest_info is None:
            ingest_info = IngestInfo()
        # A fresh defaultdict(set) is handed to every top-level extraction so
        # no state is shared between calls.
        self._extract(content, ingest_info, defaultdict(set))
        return ingest_info.prune()