def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                  mock_session_return):
    """Tests that duplicate ingest_info.Person objects are merged before
    write.

    Three batches are written for the same region: an IngestInfo holding one
    person with a booking, a second IngestInfo holding a different person,
    and a deep copy of the first IngestInfo. After persisting, the proto
    passed to ``mock_write`` must equal the proto built from the people of
    the two distinct IngestInfos.
    """
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    # Arrange
    ii = IngestInfo()
    ii.create_person(person_id=TEST_ID, full_name=TEST_NAME) \
        .create_booking(booking_id=TEST_ID)

    # The second person must live on `ii_2` itself; adding them to `ii`
    # would leave this batch empty and make `ii_1_dup` duplicate both people.
    ii_2 = IngestInfo()
    ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

    ii_1_dup = copy.deepcopy(ii)

    # Each task gets a distinct endpoint so the three tasks differ.
    t1, t2, t3 = (Task(task_type=constants.TaskType.SCRAPE_DATA,
                       endpoint=TEST_ENDPOINT + str(i),
                       response_type=constants.ResponseType.TEXT)
                  for i in range(3))

    # Act
    batch_persistence.write(ii, scrape_key, t1)
    batch_persistence.write(ii_2, scrape_key, t2)
    batch_persistence.write(ii_1_dup, scrape_key, t3)

    batch_persistence.persist_to_database(scrape_key.region_code,
                                          mock_session.start)

    # Assert
    expected_ii = IngestInfo(people=ii.people + ii_2.people)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)
def test_write_to_datastore(self, mock_session_return):
    """Writes one IngestInfo batch and checks it is read back unchanged.

    The expected batch pairs the IngestInfo with the hash of the task's
    key-sorted JSON serialization.
    """
    session = create_mock_session()
    mock_session_return.return_value = session
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ingest_info = IngestInfo()
    person = ingest_info.create_person(full_name=TEST_NAME)
    person.create_booking(booking_id=TEST_ID)

    task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    serialized_task = json.dumps(task.to_serializable(), sort_keys=True)
    expected = BatchIngestInfoData(ingest_info=ingest_info,
                                   task_hash=hash(serialized_task))

    batch_persistence.write(ingest_info, key, task)

    stored = batch_persistence._get_batch_ingest_info_list(
        key.region_code, session.start)

    self.assertEqual(len(stored), 1)
    self.assertEqual(expected, stored[0])
def test_skip_empty(self):
    """Extracts ``_SKIP_EMPTY`` with the skip_empty.yaml mapping and compares
    the result to one expected person holding two bookings."""
    mapping_path = os.path.join(os.path.dirname(__file__),
                                'fixtures/skip_empty.yaml')
    extractor = JsonDataExtractor(mapping_path)

    # First booking: in custody, with two charges.
    first_booking = Booking(
        custody_status='in custody',
        booking_id='1',
        charges=[
            Charge(name="battery"),
            Charge(name="assault", charge_class='misdemeanor'),
        ],
    )
    # Second booking: a single felony charge and no custody status.
    second_booking = Booking(
        booking_id='2',
        charges=[Charge(name='robbery', charge_class='felony')],
    )

    expected = IngestInfo()
    expected.create_person(full_name='skip empty',
                           bookings=[first_booking, second_booking])

    result = extractor.extract_and_populate_data(_SKIP_EMPTY)

    self.assertEqual(result, expected)
def test_persist_to_db(self, mock_write, _mock_region, mock_session_return):
    """Persists one written batch, then checks the proto handed to
    ``mock_write`` and that no ingest infos remain for the region."""
    session = create_mock_session()
    mock_session_return.return_value = session
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ingest_info = IngestInfo()
    person = ingest_info.create_person(person_id=TEST_ID,
                                       full_name=TEST_NAME)
    person.create_booking(booking_id=TEST_ID)

    task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(ingest_info, key, task)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(ingest_info)

    batch_persistence.persist_to_database(key.region_code, session.start)

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    # Once persisted, nothing should be left on Datastore for this region.
    remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], session.start)
    self.assertEqual(len(remaining), 0)
def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                         mock_session_return):
    """Persists batches for two regions in turn and checks each write.

    The first region is written and persisted, after which it must have no
    ingest infos left. The second region is then written, checked to hold
    exactly one ingest info, and persisted. ``mock_write`` must have been
    called twice overall.
    """
    first_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    second_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    first_info = IngestInfo()
    first_person = first_info.create_person(person_id=TEST_ID,
                                            full_name=TEST_NAME)
    first_person.create_booking(booking_id=TEST_ID)

    second_info = IngestInfo()
    second_person = second_info.create_person(person_id=TEST_ID,
                                              full_name=TEST_NAME2)
    second_person.create_booking(booking_id=TEST_ID)

    first_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    second_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # --- First region: write, persist, verify. ---
    first_session = create_mock_session()
    mock_session_return.return_value = first_session

    batch_persistence.write(first_info, first_key, first_task)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(first_info)
    batch_persistence.persist_to_database(first_key.region_code,
                                          first_session.start)

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    # We expect the region that we persisted to have no more ingest infos.
    first_remaining = \
        datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], first_session.start)
    self.assertEqual(len(first_remaining), 0)

    # --- Second region: write, check it is stored, persist, verify. ---
    second_session = create_mock_session()
    mock_session_return.return_value = second_session

    batch_persistence.write(second_info, second_key, second_task)
    second_stored = \
        datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[1], second_session.start)
    self.assertEqual(len(second_stored), 1)

    expected_proto = ingest_utils.convert_ingest_info_to_proto(second_info)
    batch_persistence.persist_to_database(second_key.region_code,
                                          second_session.start)

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    self.assertEqual(mock_write.call_count, 2)
def testParse(self):
    """Parses ``_ROSTER_JSON`` with UsMaMiddlesexParser and validates the
    result against a hand-built IngestInfo of two people."""
    region = regions.get_region('us_ma_middlesex', is_direct_ingest=True)
    controller = region.get_ingestor()
    enum_overrides = controller.get_enum_overrides()
    metadata = IngestMetadata(region.region_code,
                              region.jurisdiction_id,
                              _FAKE_START_TIME,
                              enum_overrides)

    ingest_info = UsMaMiddlesexParser().parse(_ROSTER_JSON)

    expected_info = IngestInfo()

    # First person: one booking with two charges (the second carries a bond)
    # and one hold.
    first_person = expected_info.create_person(
        person_id='12345       ',
        birthdate='1111-01-01 00:00:00.000',
        gender='M',
        ethnicity='HISPANIC',
        place_of_residence='123 ST DORCHESTER MA 01234     ',
    )
    first_booking = first_person.create_booking(
        booking_id='1.0',
        admission_date='2017-01-01 00:00:00.000',
        admission_reason='BAIL MITTIMUS',
        facility='MAIN      ',
    )
    first_booking.create_charge(
        charge_id='1245.0',
        statute='90/24/K',
        name='OUI-LIQUOR, 2ND OFFENSE c90 ss24',
        case_number='111.0',
        court_type='Middlesex SC (81)',
        charge_notes='Other',
    )
    bonded_charge = first_booking.create_charge(
        charge_id='1502.0',
        offense_date='2017-01-28 00:00:00',
        statute='90/23/J',
        name='OUI while license suspended for OUI',
        case_number='222.0',
        court_type='Middlesex SC (81)',
        charge_notes='Drug or Alcohol',
        status='DISMISSED',
    )
    bonded_charge.create_bond(bond_id='12345.0')
    first_booking.create_hold(hold_id='00000.0',
                              jurisdiction_name='Middlesex SC (81)')

    # Second person: one booking with an arrest and a single charge.
    second_person = expected_info.create_person(
        person_id='10472       ',
        birthdate='1111-02-02 00:00:00.000',
        gender='M',
        race='BLACK or AFRICAN AMERICAN',
        place_of_residence='456 ST MALDEN MA 98765      ',
    )
    second_booking = second_person.create_booking(
        booking_id='333.0',
        admission_date='2018-02-02 00:00:00.000',
        admission_reason='SENTENCE MITTIMUS',
        facility='MAIN      ',
    )
    second_booking.create_arrest(agency='Cambridge PD')
    second_booking.create_charge(
        charge_id='12341234.0',
        statute='269/10/J',
        name='FIREARM, CARRY WITHOUT LICENSE c269 ss10',
        case_number='555.0',
        charge_notes='Other',
        court_type='Cambridge DC (52)',
    )

    self.validate_ingest(ingest_info, expected_info, metadata)
def test_readPeopleWithOpenBookings(self):
    """Stores three people and checks that reading people with open
    bookings for a matching ingested person returns only the one who has
    both the matching full name and an open booking."""
    admitted = datetime.datetime(2018, 6, 20)
    released = datetime.date(2018, 7, 20)

    # Timestamps shared by both booking templates.
    booking_times = dict(
        admission_date=admitted,
        first_seen_time=admitted,
        last_seen_time=admitted,
    )
    open_booking = Booking(
        custody_status=CustodyStatus.IN_CUSTODY.value,
        **booking_times,
    )
    closed_booking = Booking(
        custody_status=CustodyStatus.RELEASED.value,
        release_date=released,
        **booking_times,
    )

    # Region fields shared by every stored person.
    region_fields = dict(region=_REGION, jurisdiction_id=_JURISDICTION_ID)

    # Open booking but no name: must not be returned.
    person_no_match = Person(
        person_id=1,
        bookings=[deepcopy(open_booking)],
        **region_fields,
    )
    # Open booking and matching name: the only expected result.
    person_match_full_name = Person(
        person_id=2,
        bookings=[deepcopy(open_booking)],
        full_name=_FULL_NAME,
        **region_fields,
    )
    # Matching name but only a closed booking: must not be returned.
    person_no_open_bookings = Person(
        person_id=6,
        full_name=_FULL_NAME,
        bookings=[closed_booking],
        **region_fields,
    )

    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        for stored_person in (person_no_match,
                              person_no_open_bookings,
                              person_match_full_name):
            session.add(stored_person)
        session.commit()

        info = IngestInfo()
        info.create_person(full_name=_FULL_NAME, person_id=_EXTERNAL_ID)
        people = dao.read_people_with_open_bookings(
            session, _REGION, info.people)

        expected_people = [
            converter.convert_schema_object_to_entity(person_match_full_name)
        ]
        self.assertCountEqual(people, expected_people)
def test_cell_ordering(self):
    """Tests that the HtmlDataExtractor handles 'th' and 'td' cells in the
    correct order.

    Only the first extracted person is compared against the first expected
    person.
    """
    expected_info = IngestInfo()
    for birthdate in ('A', 'B', 'C'):
        expected_info.create_person(birthdate=birthdate)

    info = self.extract('mixed_cells.html', 'good_table.yaml')

    first_expected = expected_info.people[0]
    self.assertEqual(first_expected, info.people[0])
def test_cell_ordering(self) -> None:
    """Tests that the HtmlDataExtractor handles 'th' and 'td' cells in the
    correct order.

    Only the first extracted person is compared against the first expected
    person.
    """
    expected_info = IngestInfo()
    for birthdate in ("A", "B", "C"):
        expected_info.create_person(birthdate=birthdate)

    info = self.extract("mixed_cells.html", "good_table.yaml")

    first_expected = expected_info.people[0]
    self.assertEqual(first_expected, info.people[0])
def test_single_page_roster(self):
    """Tests that bookings are not treated as multi-key classes, i.e. we
    assume that a person has at most one booking if they are listed in
    columns."""
    # (full_name, birthdate, booking_id) for each expected roster row.
    roster_rows = (
        ("PERSON ONE", "1/1/1111", "NUMBER ONE"),
        ("PERSON TWO", "2/2/2222", "NUMBER TWO"),
        ("PERSON THREE", "3/3/3333", "NUMBER THREE"),
    )

    expected_info = IngestInfo()
    for full_name, birthdate, booking_id in roster_rows:
        person = expected_info.create_person(full_name=full_name,
                                             birthdate=birthdate)
        person.create_booking(booking_id=booking_id)

    info = self.extract("single_page_roster.html",
                        "single_page_roster.yaml")

    self.assertEqual(expected_info, info)
def test_jailtracker_person(self) -> None:
    """Extracts the jailtracker_person.json fixture with its YAML mapping
    and compares the result to a single expected person."""
    mapping_path = os.path.join(
        os.path.dirname(__file__), "fixtures/jailtracker_person.yaml"
    )
    extractor = JsonDataExtractor(mapping_path)

    expected_result = IngestInfo()
    expected_result.create_person(
        person_id="012345",
        birthdate="12/12/0001",
        age="2018",
        race="WHITE",
    )

    payload = fixtures.as_dict("extractor", "jailtracker_person.json")
    result = extractor.extract_and_populate_data(payload)

    self.assertEqual(result, expected_result)
def test_jailtracker_person(self):
    """Extracts ``_JT_PERSON`` with the jailtracker_person.yaml mapping and
    compares the result to a single expected person."""
    mapping_path = os.path.join(os.path.dirname(__file__),
                                'fixtures/jailtracker_person.yaml')
    extractor = JsonDataExtractor(mapping_path)

    expected_result = IngestInfo()
    expected_result.create_person(
        person_id='012345',
        birthdate='12/12/0001',
        age='2018',
        race='WHITE',
    )

    result = extractor.extract_and_populate_data(_JT_PERSON)

    self.assertEqual(result, expected_result)
def test_nested_good_table(self) -> None:
    """Tests a well modelled nested table.

    The expected tree is one person with one booking; the booking has one
    hold and one charge, and the charge has one bond.
    """
    expected_info = IngestInfo()

    # Person-level fields.
    expected_person = expected_info.create_person()
    expected_person.person_id = "18-00187"
    expected_person.surname = "LAST NAME"
    expected_person.birthdate = "06/03/2999"
    expected_person.age = "100000000"
    expected_person.gender = "Male"
    expected_person.race = "White/Eurp/ N.Afr/Mid Eas"

    # Booking-level fields and the hold attached to the booking.
    expected_booking = expected_person.create_booking(
        booking_id="18-00000",
        admission_date="1/05/2000 09:39",
    )
    expected_booking.create_hold(jurisdiction_name="District Court 13-3-01")

    # Charge-level fields.
    expected_charge = expected_booking.create_charge(
        statute="901",
        name="Criminal Attempt [INCHOATE]",
        case_number="CR-000-2000",
    )

    # Bond attached to the charge.
    expected_charge.create_bond(amount="$1.00")

    info = self.extract("nested_good_table.html", "nested_good_table.yaml")

    self.assertEqual(expected_info, info)
def test_text_label(self) -> None:
    """Tests a page with a key/value pair in plain text.

    The expected tree is one person with one booking; the booking has an
    arrest and two charges, and only the second charge has a bond.
    """
    expected_info = IngestInfo()

    expected_person = expected_info.create_person(
        birthdate="12/25/0",
        race="W",
        gender="M",
    )

    expected_booking = expected_person.create_booking()
    expected_booking.booking_id = "202200000"
    expected_booking.facility = "Southwest Detention Center"
    expected_booking.admission_date = "01/01/2001 19:44"
    expected_booking.release_date = "11/01/2014"
    expected_booking.total_bond_amount = "00000000"

    expected_arrest = expected_booking.create_arrest()
    expected_arrest.agency = "Hemet PD"
    expected_arrest.arrest_date = "01/01/2001 09:01"

    first_charge = expected_booking.create_charge()
    first_charge.name = "CHARGE 1"
    first_charge.statute = "245(A)(1)"
    first_charge.degree = "FEL"
    first_charge.status = "DISM"

    second_charge = expected_booking.create_charge()
    second_charge.name = "CHARGE 2"
    second_charge.statute = "245(A)(4)"
    second_charge.degree = "FEL"
    second_charge.status = "SENT"
    second_charge.create_bond(amount="$100")

    info = self.extract("text_label.html", "text_label.yaml")

    self.assertEqual(expected_info, info)
def sample_ingest_info(number: str) -> IngestInfo:
    """Build an IngestInfo containing one person with a fixed full name.

    Args:
        number: Value stored as the person's ``person_id``.

    Returns:
        The new IngestInfo holding that single person.
    """
    sample = IngestInfo()
    sample.create_person(
        full_name='LAST NAME, FIRST NAME MIDDLE NAME',
        person_id=number,
    )
    return sample
def test_text_label(self):
    """Tests a page with a key/value pair in plain text.

    The expected tree is one person with one booking; the booking has an
    arrest and two charges, and only the second charge has a bond.
    """
    expected_info = IngestInfo()

    expected_person = expected_info.create_person(
        birthdate='12/25/0',
        race='W',
        gender='M',
    )

    expected_booking = expected_person.create_booking()
    expected_booking.booking_id = '202200000'
    expected_booking.facility = 'Southwest Detention Center'
    expected_booking.admission_date = '01/01/2001 19:44'
    expected_booking.release_date = '11/01/2014'
    expected_booking.total_bond_amount = '00000000'

    expected_arrest = expected_booking.create_arrest()
    expected_arrest.agency = 'Hemet PD'
    expected_arrest.arrest_date = '01/01/2001 09:01'

    first_charge = expected_booking.create_charge()
    first_charge.name = 'CHARGE 1'
    first_charge.statute = '245(A)(1)'
    first_charge.degree = 'FEL'
    first_charge.status = 'DISM'

    second_charge = expected_booking.create_charge()
    second_charge.name = 'CHARGE 2'
    second_charge.statute = '245(A)(4)'
    second_charge.degree = 'FEL'
    second_charge.status = 'SENT'
    second_charge.create_bond(amount='$100')

    info = self.extract('text_label.html', 'text_label.yaml')

    self.assertEqual(expected_info, info)
def test_nested_good_table(self):
    """Tests a well modelled nested table.

    The expected tree is one person with one booking; the booking has one
    hold and one charge, and the charge has one bond.
    """
    expected_info = IngestInfo()

    # Person-level fields.
    expected_person = expected_info.create_person()
    expected_person.person_id = '18-00187'
    expected_person.surname = 'LAST NAME'
    expected_person.birthdate = '06/03/2999'
    expected_person.age = '100000000'
    expected_person.gender = 'Male'
    expected_person.race = 'White/Eurp/ N.Afr/Mid Eas'

    # Booking-level fields and the hold attached to the booking.
    expected_booking = expected_person.create_booking(
        booking_id='18-00000',
        admission_date='1/05/2000 09:39',
    )
    expected_booking.create_hold(jurisdiction_name='District Court 13-3-01')

    # Charge-level fields.
    expected_charge = expected_booking.create_charge(
        statute='901',
        name='Criminal Attempt [INCHOATE]',
        case_number='CR-000-2000',
    )

    # Bond attached to the charge.
    expected_charge.create_bond(amount='$1.00')

    info = self.extract('nested_good_table.html', 'nested_good_table.yaml')

    self.assertEqual(expected_info, info)
def test_good_table_with_link(self):
    """Tests a well modelled table with a link.

    The expected result is a single person carrying only a birthdate.
    """
    expected_info = IngestInfo()
    expected_info.create_person(birthdate='1/15/2048')

    extracted = self.extract('good_table_links.html', 'good_table.yaml')

    self.assertEqual(expected_info, extracted)
def test_single_page_roster(self):
    """Tests that bookings are not treated as multi-key classes, i.e. we
    assume that a person has at most one booking if they are listed in
    columns."""
    # (full_name, birthdate, booking_id) for each expected roster row.
    roster_rows = (
        ('PERSON ONE', '1/1/1111', 'NUMBER ONE'),
        ('PERSON TWO', '2/2/2222', 'NUMBER TWO'),
        ('PERSON THREE', '3/3/3333', 'NUMBER THREE'),
    )

    expected_info = IngestInfo()
    for full_name, birthdate, booking_id in roster_rows:
        person = expected_info.create_person(full_name=full_name,
                                             birthdate=birthdate)
        person.create_booking(booking_id=booking_id)

    info = self.extract('single_page_roster.html',
                        'single_page_roster.yaml')

    self.assertEqual(expected_info, info)
def test_good_table(self) -> None:
    """Tests a well modelled table.

    The expected result is a single person carrying only a birthdate.
    """
    expected_info = IngestInfo()
    expected_info.create_person(birthdate="1/15/2048")

    extracted = self.extract("good_table.html", "good_table.yaml")

    self.assertEqual(expected_info, extracted)
def test_bond_multi_key(self):
    """Extracts bonds.html and expects one booking with three charges, each
    carrying its own bond."""
    expected_info = IngestInfo()
    person = expected_info.create_person()
    booking = person.create_booking()

    # One charge per (bond_id, amount) pair, in this order.
    for bond_id, amount in (('1', '10'), ('2', '20'), ('3', '30')):
        charge = booking.create_charge()
        charge.create_bond(bond_id=bond_id, amount=amount)

    info = self.extract('bonds.html', 'bonds.yaml')

    self.assertEqual(expected_info, info)
def test_child_first(self):
    """Tests that in multi_key mappings (columns in a table), parent objects
    are created where needed."""
    expected_info = IngestInfo()
    person = expected_info.create_person()

    # Each (admission_date, charge name) pair yields its own booking.
    for admission_date, charge_name in (('111', 'AAA'), ('222', 'BBB')):
        booking = person.create_booking(admission_date=admission_date)
        booking.create_charge(name=charge_name)

    info = self.extract('child_first.html', 'child_first.yaml')

    self.assertEqual(expected_info, info)
def test_child_first(self) -> None:
    """Tests that in multi_key mappings (columns in a table), parent objects
    are created where needed."""
    expected_info = IngestInfo()
    person = expected_info.create_person()

    # Each (admission_date, charge name) pair yields its own booking.
    for admission_date, charge_name in (("111", "AAA"), ("222", "BBB")):
        booking = person.create_booking(admission_date=admission_date)
        booking.create_charge(name=charge_name)

    info = self.extract("child_first.html", "child_first.yaml")

    self.assertEqual(expected_info, info)
def test_th_rows(self):
    """Tests a yaml file with <th> keys in rows.

    The expected result is a single person with race and gender set.
    """
    expected_info = IngestInfo()
    expected_info.create_person(race='WHITE', gender='M')

    extracted = self.extract('th_rows.html', 'th_rows.yaml')

    self.assertEqual(expected_info, extracted)
def test_th_rows(self) -> None:
    """Tests a yaml file with <th> keys in rows.

    The expected result is a single person with race and gender set.
    """
    expected_info = IngestInfo()
    expected_info.create_person(race="WHITE", gender="M")

    extracted = self.extract("th_rows.html", "th_rows.yaml")

    self.assertEqual(expected_info, extracted)
def test_bond_multi_key(self) -> None:
    """Extracts bonds.html and expects one booking with three charges, each
    carrying its own bond."""
    expected_info = IngestInfo()
    person = expected_info.create_person()
    booking = person.create_booking()

    # One charge per (bond_id, amount) pair, in this order.
    for bond_id, amount in (("1", "10"), ("2", "20"), ("3", "30")):
        charge = booking.create_charge()
        charge.create_bond(bond_id=bond_id, amount=amount)

    info = self.extract("bonds.html", "bonds.yaml")

    self.assertEqual(expected_info, info)
def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                   _mock_region,
                                                   mock_session_return):
    """Writes a success and an error for two identical tasks and expects
    persistence to succeed, write the proto, and leave no ingest infos for
    the region."""
    session = create_mock_session()
    mock_session_return.return_value = session
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    mock_write.return_value = True

    ingest_info = IngestInfo()
    person = ingest_info.create_person(person_id=TEST_ID,
                                       full_name=TEST_NAME)
    person.create_booking(booking_id=TEST_ID)

    passing_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # Because the tasks are the same, we expect that to be counted as a
    # pass.
    failing_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(ingest_info, key, passing_task)
    batch_persistence.write_error(TEST_ERROR, TEST_TRACE, failing_task, key)

    expected_proto = ingest_utils.convert_ingest_info_to_proto(ingest_info)

    persisted = batch_persistence.persist_to_database(key.region_code,
                                                      session.start)
    self.assertTrue(persisted)

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], session.start)
    self.assertEqual(len(remaining), 0)
def test_one_to_many(self):
    """Extracts a two-cell sentence-length snippet with the one_to_many.yaml
    mapping and expects a person/booking/charge chain ending in a sentence
    whose min and max length are both set."""
    mapping_path = os.path.join(
        os.path.dirname(__file__),
        "../testdata/data_extractor/yaml/one_to_many.yaml",
    )
    extractor = HtmlDataExtractor(mapping_path)

    expected_info = IngestInfo()
    person = expected_info.create_person()
    booking = person.create_booking()
    charge = booking.create_charge()
    charge.create_sentence(min_length="1 day", max_length="1 day")

    page = html.fromstring("<td>Sentence Length</td><td>1 day</td>")
    info = extractor.extract_and_populate_data(page)

    self.assertEqual(expected_info, info)
def test_skip_empty(self) -> None:
    """Extracts the skip_empty.json fixture with its YAML mapping and
    compares the result to one expected person holding two bookings."""
    mapping_path = os.path.join(
        os.path.dirname(__file__), "fixtures/skip_empty.yaml"
    )
    extractor = JsonDataExtractor(mapping_path)

    # First booking: in custody, with two charges.
    first_booking = Booking(
        custody_status="in custody",
        booking_id="1",
        charges=[
            Charge(name="battery"),
            Charge(name="assault", charge_class="misdemeanor"),
        ],
    )
    # Second booking: a single felony charge and no custody status.
    second_booking = Booking(
        booking_id="2",
        charges=[Charge(name="robbery", charge_class="felony")],
    )

    expected = IngestInfo()
    expected.create_person(
        full_name="skip empty",
        bookings=[first_booking, second_booking],
    )

    payload = fixtures.as_dict("extractor", "skip_empty.json")
    result = extractor.extract_and_populate_data(payload)

    self.assertEqual(result, expected)
def test_persist_to_db_failed_no_write(self, mock_write, _mock_region,
                                       mock_session_return):
    """Writes a success for one task and an error for a different task, and
    expects persistence to fail without calling ``mock_write`` and with both
    batches still stored."""
    session = create_mock_session()
    mock_session_return.return_value = session
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ingest_info = IngestInfo()
    person = ingest_info.create_person(person_id=TEST_ID,
                                       full_name=TEST_NAME)
    person.create_booking(booking_id=TEST_ID)

    passing_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # Because the tasks are different, we should fail.
    failing_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
        params=TEST_PARAMS,
    )

    batch_persistence.write(ingest_info, key, passing_task)
    batch_persistence.write_error(TEST_ERROR, TEST_TRACE, failing_task, key)

    persisted = batch_persistence.persist_to_database(key.region_code,
                                                      session.start)
    self.assertFalse(persisted)

    self.assertEqual(mock_write.call_count, 0)

    # We should still have both items still on Datastore because they
    # weren't persisted.
    still_stored = batch_persistence._get_batch_ingest_info_list(
        key.region_code, session.start)
    self.assertEqual(len(still_stored), 2)