def store_response(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    """Store a fetched response: save the content of successful downloads (or
    delegate failed ones to the error handler) and queue any resulting stories
    for extraction."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']
    download_url = download['url']

    log.info(f"Handling download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} which is about to be handled: {download_url})")

    # Failed fetches get recorded (and possibly rescheduled) instead of stored
    if not response.is_success():
        log.info(f"Download {downloads_id} errored: {response.decoded_content()}")
        self._store_failed_download_error_message(db=db, download=download, response=response)
        return

    # Only store the body of text-like responses; everything else gets a placeholder
    supported_content_types_regex = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
    if supported_content_types_regex.search(response.content_type() or ''):
        content = response.decoded_content()
    else:
        content = '(unsupported content type)'

    # Sync the stored URL with the (possibly changed) URL on the download object
    db.query("""
        UPDATE downloads
        SET url = %(download_url)s
        WHERE downloads_id = %(downloads_id)s
          AND url != %(download_url)s
    """, {
        'downloads_id': downloads_id,
        'download_url': download_url,
    })

    story_ids_to_extract = self.store_download(db=db, download=download, content=content)

    for stories_id in story_ids_to_extract:
        log.debug(f"Adding story {stories_id} for download {downloads_id} to extraction queue...")
        JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    log.info(f"Handled download {downloads_id}.")
    log.debug(f"(URL of download {downloads_id} that was just handled: {download_url})")
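
# --- Standalone sketch (illustration only, not part of the handler class). ---
# A minimal demonstration of the content type gate used in store_response():
# only responses whose Content-Type matches the regex get their decoded body
# stored; anything else is replaced with the "(unsupported content type)"
# placeholder. The helper name and sample values below are assumptions chosen
# for illustration.
import re
from typing import Optional

def _content_type_is_storable(content_type: Optional[str]) -> bool:
    supported = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
    # `or ''` mirrors the handler's guard against a missing Content-Type header
    return bool(supported.search(content_type or ''))

assert _content_type_is_storable('text/html; charset=UTF-8')   # matches "text"/"html"
assert _content_type_is_storable('application/rss+xml')        # matches "rss"/"xml"
assert _content_type_is_storable('application/json')
assert not _content_type_is_storable('image/png')               # body is not stored
assert not _content_type_is_storable(None)                      # no Content-Type header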
def _store_failed_download_error_message(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    """
    Deal with errors returned by the fetcher response.

    If the error status looks like something the site could recover from
    (503, "500 read timeout"), reschedule the download with back-off timing.
    If the status is not recognized as recoverable, or the maximum number of
    retries has been exceeded, set the download's 'state' to 'error' and store
    a descriptive 'error_message'.
    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if response.is_success():
        # Hard error because only failed responses should reach this helper
        raise McCrawlerFetcherHardError("Download was successful, so nothing to handle")

    # The retry counter is carried in the stored error message as a trailing
    # "[error_num: N]" tag; bump it on every failed attempt
    error_num = 1
    error = download.get('error_message', None)
    if error:
        error_num_match = re.search(r'\[error_num: (\d+)\]$', error)
        if error_num_match:
            error_num = int(error_num_match.group(1)) + 1

    error_message = f"{response.status_line()}\n[error_num: {error_num}]"

    responded_with_timeout = re.search(r'(503|500 read timeout)', response.status_line(), flags=re.IGNORECASE)
    if responded_with_timeout and error_num < self._MAX_5XX_RETRIES:
        # Recoverable server error: retry later, backing off linearly
        # (1 hour after the first failure, 2 after the second, and so on)
        db.query("""
            UPDATE downloads
            SET state = 'pending',
                download_time = NOW() + %(download_interval)s::interval,
                error_message = %(error_message)s
            WHERE downloads_id = %(downloads_id)s
        """, {
            'download_interval': f"{error_num} hours",
            'error_message': error_message,
            'downloads_id': downloads_id,
        })
    else:
        # Unrecoverable error (or too many retries): give up on the download
        db.query("""
            UPDATE downloads
            SET state = 'error',
                error_message = %(error_message)s
            WHERE downloads_id = %(downloads_id)s
        """, {
            'error_message': error_message,
            'downloads_id': downloads_id,
        })
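
# --- Standalone sketch (illustration only, not part of the handler class). ---
# The retry bookkeeping above persists the attempt counter inside
# `error_message` itself as a trailing "[error_num: N]" tag, and the back-off
# interval grows linearly ("1 hours", "2 hours", ...) until
# self._MAX_5XX_RETRIES (defined elsewhere on the class) is reached.
# `_next_error_num` is a hypothetical helper name used only for this sketch.
import re
from typing import Optional

def _next_error_num(previous_error_message: Optional[str]) -> int:
    """Attempt number for the next retry: 1 if no prior tag was stored."""
    if previous_error_message:
        match = re.search(r'\[error_num: (\d+)\]$', previous_error_message)
        if match:
            return int(match.group(1)) + 1
    return 1

assert _next_error_num(None) == 1
assert _next_error_num('503 Service Unavailable\n[error_num: 2]') == 3
# With error_num == 3, the handler would reschedule the download for
# NOW() + '3 hours'::interval, provided 3 < self._MAX_5XX_RETRIES.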