def testWrite_SingleCountBadDataFails(self):
    """Each rejected input must raise and leave SingleCountAggregate empty."""

    def assert_nothing_stored():
        # Re-query the table after every failed write.
        stored_rows = SessionFactory.for_schema_base(JailsBase).query(
            SingleCountAggregate).all()
        self.assertEqual(stored_rows, [])

    # (expected exception, extra SingleCount fields, jurisdiction id).
    # The first case passes a 7-character jurisdiction id with no extra
    # fields; the remaining cases pass an 8-character id with one bad field.
    rejected_inputs = (
        (ValueError, {}, '1001001'),
        (EnumParsingError, {'ethnicity': 'Not an Ethnicity'}, '01001001'),
        (EnumParsingError, {'gender': 'Not a Gender'}, '01001001'),
        (EnumParsingError, {'race': 'Not a Race'}, '01001001'),
        (ValueError, {'date': 'Not a date'}, '01001001'),
    )

    for expected_error, extra_fields, jid in rejected_inputs:
        # Build the SingleCount inside the context manager so an error
        # raised during construction is also captured by assertRaises.
        with self.assertRaises(expected_error):
            store_single_count(SingleCount(count=311, **extra_fields), jid)
        assert_nothing_stored()
def testWrite_SingleCountBadDataFails(self):
    """Each rejected input must raise and leave SingleCountAggregate empty."""

    def assert_nothing_stored():
        # Re-query the table after every failed write.
        with SessionFactory.using_database(
            self.database_key, autocommit=False
        ) as session:
            stored_rows = session.query(SingleCountAggregate).all()
            self.assertEqual(stored_rows, [])

    # (expected exception, extra SingleCount fields, jurisdiction id).
    # The first case passes a 7-character jurisdiction id with no extra
    # fields; the remaining cases pass an 8-character id with one bad field.
    rejected_inputs = (
        (ValueError, {}, "1001001"),
        (EnumParsingError, {"ethnicity": "Not an Ethnicity"}, "01001001"),
        (EnumParsingError, {"gender": "Not a Gender"}, "01001001"),
        (EnumParsingError, {"race": "Not a Race"}, "01001001"),
        (ValueError, {"date": "Not a date"}, "01001001"),
    )

    for expected_error, extra_fields, jid in rejected_inputs:
        # Build the SingleCount inside the context manager so an error
        # raised during construction is also captured by assertRaises.
        with self.assertRaises(expected_error):
            store_single_count(SingleCount(count=311, **extra_fields), jid)
        assert_nothing_stored()
def testWrite_SingleCountWithDate(self):
    """A count stored with a date yields one row carrying that date."""
    dated_count = SingleCount(count=_COUNT, date=_TODAY)
    store_single_count(dated_count, '01001001')

    db_session = SessionFactory.for_schema_base(JailsBase)
    # `one` is expected to fail unless exactly one row was written.
    stored_row = one(db_session.query(SingleCountAggregate).all())

    self.assertEqual(stored_row.jid, _JID)
    self.assertEqual(stored_row.count, _COUNT)
    self.assertEqual(stored_row.date, _TODAY)
def testWrite_SingleCountWithRace(self):
    """A count stored with a race yields one row carrying that race."""
    race_count = SingleCount(count=_COUNT, race=Race.ASIAN)
    store_single_count(race_count, '01001001')

    db_session = SessionFactory.for_schema_base(JailsBase)
    # `one` is expected to fail unless exactly one row was written.
    stored_row = one(db_session.query(SingleCountAggregate).all())

    self.assertEqual(stored_row.jid, _JID)
    self.assertEqual(stored_row.count, _COUNT)
    # The stored value is converted back to the enum before comparing.
    self.assertEqual(Race(stored_row.race), Race.ASIAN)
def testWrite_SingleCountWithGender(self):
    """A count stored with a gender yields one row carrying that gender."""
    gender_count = SingleCount(count=_COUNT, gender=Gender.FEMALE)
    store_single_count(gender_count, '01001001')

    db_session = SessionFactory.for_schema_base(JailsBase)
    # `one` is expected to fail unless exactly one row was written.
    stored_row = one(db_session.query(SingleCountAggregate).all())

    self.assertEqual(stored_row.jid, _JID)
    self.assertEqual(stored_row.count, _COUNT)
    # The stored value is converted back to the enum before comparing.
    self.assertEqual(Gender(stored_row.gender), Gender.FEMALE)
def testWrite_SingleCountWithDate(self):
    """A count stored with a date yields one row carrying that date."""
    dated_count = SingleCount(count=_COUNT, date=_TODAY)
    store_single_count(dated_count, "01001001")

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        # `one` is expected to fail unless exactly one row was written.
        stored_row = one(session.query(SingleCountAggregate).all())

        self.assertEqual(stored_row.jid, _JID)
        self.assertEqual(stored_row.count, _COUNT)
        self.assertEqual(stored_row.date, _TODAY)
def testWrite_SingleCountWithEthnicity(self):
    """A count stored with an ethnicity yields one row carrying it."""
    ethnicity_count = SingleCount(count=_COUNT, ethnicity=Ethnicity.HISPANIC)
    store_single_count(ethnicity_count, '01001001')

    db_session = SessionFactory.for_schema_base(JailsBase)
    # `one` is expected to fail unless exactly one row was written.
    stored_row = one(db_session.query(SingleCountAggregate).all())

    self.assertEqual(stored_row.jid, _JID)
    self.assertEqual(stored_row.count, _COUNT)
    # The stored value is converted back to the enum before comparing.
    self.assertEqual(Ethnicity(stored_row.ethnicity), Ethnicity.HISPANIC)
def testWrite_SingleCountWithGender(self):
    """A count stored with a gender yields one row carrying that gender."""
    gender_count = SingleCount(count=_COUNT, gender=Gender.FEMALE)
    store_single_count(gender_count, "01001001")

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        # `one` is expected to fail unless exactly one row was written.
        stored_row = one(session.query(SingleCountAggregate).all())

        self.assertEqual(stored_row.jid, _JID)
        self.assertEqual(stored_row.count, _COUNT)
        # The stored value is converted back to the enum before comparing.
        self.assertEqual(Gender(stored_row.gender), Gender.FEMALE)
def testWrite_SingleCountWithEthnicity(self):
    """A count stored with an ethnicity yields one row carrying it."""
    ethnicity_count = SingleCount(count=_COUNT, ethnicity=Ethnicity.HISPANIC)
    store_single_count(ethnicity_count, "01001001")

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        # `one` is expected to fail unless exactly one row was written.
        stored_row = one(session.query(SingleCountAggregate).all())

        self.assertEqual(stored_row.jid, _JID)
        self.assertEqual(stored_row.count, _COUNT)
        # The stored value is converted back to the enum before comparing.
        self.assertEqual(Ethnicity(stored_row.ethnicity), Ethnicity.HISPANIC)
def store_single_count_endpoint():
    """Endpoint to store a single count.

    Reads `jid`, `ethnicity`, `gender`, `race`, `count` and `date` from the
    request's query arguments as strings, wraps them in a `SingleCount` and
    passes it to `store_single_count` together with the jurisdiction id.

    Returns:
        A `(body, status)` tuple: an empty body with `HTTPStatus.OK` when
        `store_single_count` reports success, otherwise an empty body with
        `HTTPStatus.INTERNAL_SERVER_ERROR`.
    """
    args = request.args
    jid = get_str_param_value('jid', args)
    ethnicity = get_str_param_value('ethnicity', args)
    gender = get_str_param_value('gender', args)
    race = get_str_param_value('race', args)
    count = get_str_param_value('count', args)
    date = get_str_param_value('date', args)

    sc = SingleCount(
        count=count,
        ethnicity=ethnicity,
        gender=gender,
        race=race,
        date=date,
    )

    stored = store_single_count(sc, jid)

    if stored:
        # `count` is the raw string from the query arguments, so it must be
        # formatted with %s; %d would make the logging call fail to format
        # and the message would be lost.
        logging.info("Stored [%s] as [%s] for [%s]", count,
                     ' '.join(filter(None, (race, gender, ethnicity))), jid)
        return '', HTTPStatus.OK

    logging.error("Failed to store single count for [%s]", jid)
    return '', HTTPStatus.INTERNAL_SERVER_ERROR
def _generic_scrape(self, request: QueueRequest):
    """General handler for all scrape tasks.

    This function is a generic entry point into all types of scrapes. Based
    on the task carried by `request` it:

    1. obtains content, either from the task itself or by fetching the
       task's endpoint;
    2. populates scraped data when `should_scrape_data` says so;
    3. enqueues follow-up tasks when `should_get_more_tasks` says so;
    4. persists the scraped ingest info and single counts when the scraped
       data is marked for persistence.

    Args:
        request: The queued request to process. Its `next_task` is the task
            to run; `scrape_type`, `scraper_start_time` and `ingest_info`
            are forwarded to follow-up requests and to persistence.

    Raises:
        ScraperFetchError: if fetching the content fails.
        ScraperPopulateDataError: if `populate_data` fails.
        ScraperGetMoreTasksError: if `get_more_tasks` fails.
        Exception: any other error is re-raised; when `BATCH_WRITES` is set
            the error is first recorded via `batch_persistence.write_error`.
    """
    # Bind the task before entering the try block so the error handler
    # below can always reference it.
    task = request.next_task
    try:
        # Here we handle a special case where we weren't really sure
        # we were going to get data when we submitted a task, but then
        # we ended up with data, so no more requests are required,
        # just the content we already have.
        # TODO(#680): remove this
        if task.content is not None:
            content = self._parse_html_content(task.content)
            cookies = None
        else:
            post_data = task.post_data

            # Let the child transform the post_data if it wants before
            # sending the requests. This hook is in here in case the
            # child did something like compress the post_data before
            # it put it on the queue.
            self.transform_post_data(post_data)

            # We always fetch some content before doing anything.
            try:
                content, cookies = self._fetch_content(
                    task.endpoint, task.response_type,
                    headers=task.headers, cookies=task.cookies,
                    params=task.params, post_data=post_data,
                    json_data=task.json)
            except Exception as e:
                raise ScraperFetchError(str(e)) from e

        scraped_data = None
        if self.should_scrape_data(task.task_type):
            # If we want to scrape data, we should either create an
            # ingest_info object or get the one that already exists.
            logging.info("Scraping data for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)
            try:
                scraped_data = self.populate_data(
                    content, task, request.ingest_info or IngestInfo())
            except Exception as e:
                raise ScraperPopulateDataError(str(e)) from e

        if self.should_get_more_tasks(task.task_type):
            logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)

            # Only send along ingest info if it will not be persisted now.
            ingest_info_to_send = None
            if scraped_data is not None and not scraped_data.persist:
                ingest_info_to_send = scraped_data.ingest_info

            try:
                # pylint: disable=assignment-from-no-return
                next_tasks = self.get_more_tasks(content, task)
            except Exception as e:
                raise ScraperGetMoreTasksError(str(e)) from e

            for next_task in next_tasks:
                # Include cookies received from response, if any. Merge
                # into a fresh copy for each task: updating the response
                # cookies in place would leak one task's cookies into
                # every task that follows it in this loop.
                if cookies:
                    merged_cookies = cookies.copy()
                    merged_cookies.update(next_task.cookies)
                    next_task = Task.evolve(next_task,
                                            cookies=merged_cookies)
                self.add_task(
                    '_generic_scrape',
                    QueueRequest(
                        scrape_type=request.scrape_type,
                        scraper_start_time=request.scraper_start_time,
                        next_task=next_task,
                        ingest_info=ingest_info_to_send,
                    ))

        if scraped_data is not None and scraped_data.persist:
            if scraped_data.ingest_info:
                people = scraped_data.ingest_info.people
                # Report the actual cap rather than a hard-coded number so
                # the message stays correct if the constant changes.
                logging.info("Logging at most %d people (were %d):",
                             constants.MAX_PEOPLE_TO_LOG, len(people))
                for person in people[:constants.MAX_PEOPLE_TO_LOG]:
                    logging.info("[%s]", str(person))
                logging.info("Last seen time of person being set as: [%s]",
                             request.scraper_start_time)
                metadata = IngestMetadata(self.region.region_code,
                                          self.region.jurisdiction_id,
                                          request.scraper_start_time,
                                          self.get_enum_overrides())
                if self.BATCH_WRITES:
                    logging.info(
                        "Queuing ingest_info ([%d] people) to "
                        "batch_persistence for [%s]",
                        len(people),
                        self.region.region_code)
                    scrape_key = ScrapeKey(self.region.region_code,
                                           request.scrape_type)
                    batch_persistence.write(
                        ingest_info=scraped_data.ingest_info,
                        scrape_key=scrape_key,
                        task=task,
                    )
                else:
                    logging.info(
                        "Writing ingest_info ([%d] people) to the database"
                        " for [%s]",
                        len(people),
                        self.region.region_code)
                    persistence.write(
                        ingest_utils.convert_ingest_info_to_proto(
                            scraped_data.ingest_info), metadata)

            for sc in scraped_data.single_counts:
                if not sc.date:
                    # No date on the count: fall back to the start date of
                    # the current BACKGROUND scrape session, if there is
                    # one.
                    scrape_key = ScrapeKey(self.region.region_code,
                                           constants.ScrapeType.BACKGROUND)
                    session = sessions.get_current_session(scrape_key)
                    if session:
                        sc = attr.evolve(sc, date=session.start.date())
                single_count.store_single_count(
                    sc, self.region.jurisdiction_id)
    except Exception as e:
        if self.BATCH_WRITES:
            # Record the failure alongside the batch before propagating it.
            scrape_key = ScrapeKey(self.region.region_code,
                                   request.scrape_type)
            batch_persistence.write_error(
                error=str(e),
                trace_id=get_trace_id_from_flask(),
                task=task,
                scrape_key=scrape_key,
            )
        raise