def __init__(self, ingest_info: IngestInfo, region: str):
    """Build the error message for a failed Datastore ingest-info write.

    Args:
        ingest_info: the IngestInfo object whose write failed.
        region: region code the write was performed for.
    """
    # Embed the Flask trace id so the failing request can be located in logs.
    trace_id = get_trace_id_from_flask()
    message = (
        "Error when writing to Datastore ingest info '{}' for"
        " region {}. Trace id: {}"
    ).format(ingest_info, region, trace_id)
    super().__init__(message)
def _generic_scrape(self, request: QueueRequest):
    """General handler for all scrape tasks.

    This function is a generic entry point into all types of scrapes. It
    decides what to call based on the task attached to the request: it may
    fetch content, scrape data from it, enqueue follow-up tasks, and persist
    the scraped results (either batched or written directly).

    Args:
        request: the QueueRequest carrying the next Task to run, the scrape
            type, the scraper start time, and any IngestInfo accumulated by
            earlier tasks.
    """
    try:
        task = request.next_task

        # Here we handle a special case where we weren't really sure
        # we were going to get data when we submitted a task, but then
        # we ended up with data, so no more requests are required,
        # just the content we already have.
        # TODO(#680): remove this
        if task.content is not None:
            content = self._parse_html_content(task.content)
            cookies = None
        else:
            post_data = task.post_data

            # Let the child transform the post_data if it wants before
            # sending the requests.  This hook is in here in case the
            # child did something like compress the post_data before
            # it put it on the queue.
            self.transform_post_data(post_data)

            # We always fetch some content before doing anything.
            # Note that we use get here for the post_data to return a
            # default value of None if this scraper doesn't set it.
            try:
                content, cookies = self._fetch_content(
                    task.endpoint, task.response_type, headers=task.headers,
                    cookies=task.cookies, params=task.params,
                    post_data=post_data, json_data=task.json)
            except Exception as e:
                # Wrap any fetch failure so callers can distinguish fetch
                # errors from scrape/populate errors.
                raise ScraperFetchError(str(e)) from e

        scraped_data = None
        if self.should_scrape_data(task.task_type):
            # If we want to scrape data, we should either create an
            # ingest_info object or get the one that already exists.
            logging.info("Scraping data for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)
            try:
                scraped_data = self.populate_data(
                    content, task, request.ingest_info or IngestInfo())
            except Exception as e:
                raise ScraperPopulateDataError(str(e)) from e

        if self.should_get_more_tasks(task.task_type):
            logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)

            # Only send along ingest info if it will not be persisted now.
            ingest_info_to_send = None
            if scraped_data is not None and not scraped_data.persist:
                ingest_info_to_send = scraped_data.ingest_info

            try:
                # pylint: disable=assignment-from-no-return
                next_tasks = self.get_more_tasks(content, task)
            except Exception as e:
                raise ScraperGetMoreTasksError(str(e)) from e
            for next_task in next_tasks:
                # Include cookies received from response, if any
                # NOTE(review): the same `cookies` dict is mutated across
                # iterations, so cookies from earlier next_tasks carry over
                # into later ones — confirm this accumulation is intended.
                if cookies:
                    cookies.update(next_task.cookies)
                    next_task = Task.evolve(next_task, cookies=cookies)
                self.add_task('_generic_scrape', QueueRequest(
                    scrape_type=request.scrape_type,
                    scraper_start_time=request.scraper_start_time,
                    next_task=next_task,
                    ingest_info=ingest_info_to_send,
                ))

        if scraped_data is not None and scraped_data.persist:
            if scraped_data.ingest_info:
                logging.info("Logging at most 4 people (were %d):",
                             len(scraped_data.ingest_info.people))
                loop_count = min(len(scraped_data.ingest_info.people),
                                 constants.MAX_PEOPLE_TO_LOG)
                for i in range(loop_count):
                    logging.info("[%s]",
                                 str(scraped_data.ingest_info.people[i]))
                logging.info("Last seen time of person being set as: [%s]",
                             request.scraper_start_time)
                metadata = IngestMetadata(self.region.region_code,
                                          self.region.jurisdiction_id,
                                          request.scraper_start_time,
                                          self.get_enum_overrides())
                if self.BATCH_WRITES:
                    # Batched mode: hand the ingest_info off to the batch
                    # persistence layer keyed by (region, scrape type).
                    logging.info(
                        "Queuing ingest_info ([%d] people) to "
                        "batch_persistence for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    scrape_key = ScrapeKey(self.region.region_code,
                                           request.scrape_type)
                    batch_persistence.write(
                        ingest_info=scraped_data.ingest_info,
                        scrape_key=scrape_key,
                        task=task,
                    )
                else:
                    # Direct mode: convert to proto and write to the
                    # database immediately.
                    logging.info(
                        "Writing ingest_info ([%d] people) to the database"
                        " for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    persistence.write(
                        ingest_utils.convert_ingest_info_to_proto(
                            scraped_data.ingest_info), metadata)
            for sc in scraped_data.single_counts:
                if not sc.date:
                    # Undated single counts default to the current
                    # background-scrape session's start date.
                    scrape_key = ScrapeKey(self.region.region_code,
                                           constants.ScrapeType.BACKGROUND)
                    session = sessions.get_current_session(scrape_key)
                    if session:
                        sc = attr.evolve(sc, date=session.start.date())
                single_count.store_single_count(
                    sc, self.region.jurisdiction_id)
    except Exception as e:
        if self.BATCH_WRITES:
            # Record the failure in batch persistence so the batch layer
            # knows this task errored, then re-raise for the task queue.
            # NOTE(review): if the exception fired before `task` was bound
            # (first statement of the try), this reference would raise
            # NameError — presumably `request.next_task` cannot raise;
            # confirm.
            scrape_key = ScrapeKey(self.region.region_code,
                                   request.scrape_type)
            batch_persistence.write_error(
                error=str(e),
                trace_id=get_trace_id_from_flask(),
                task=task,
                scrape_key=scrape_key,
            )
        raise e
def __init__(self, error: str, region: str):
    """Build the error message for a failed Datastore error-record write.

    Args:
        error: description of the error whose write failed.
        region: region code the write was performed for.
    """
    # Embed the Flask trace id so the failing request can be located in logs.
    trace_id = get_trace_id_from_flask()
    message = (
        "Error when writing to Datastore error '{}' for region {}. Trace id: {}"
    ).format(error, region, trace_id)
    super().__init__(message)