def store_response(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']
    download_url = download['url']

    log.info(f"Handling download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} which is about to be handled: {download_url})")

    if not response.is_success():
        log.info(f"Download {downloads_id} errored: {response.decoded_content()}")
        self._store_failed_download_error_message(db=db, download=download, response=response)
        return

    supported_content_types_regex = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
    if re.search(supported_content_types_regex, response.content_type() or ''):
        content = response.decoded_content()
    else:
        content = '(unsupported content type)'

    # Keep the stored URL in sync with the fetched one (e.g. if it changed along the way)
    db.query("""
        UPDATE downloads
        SET url = %(download_url)s
        WHERE downloads_id = %(downloads_id)s
          AND url != %(download_url)s
    """, {
        'downloads_id': downloads_id,
        'download_url': download_url,
    })

    story_ids_to_extract = self.store_download(db=db, download=download, content=content)

    for stories_id in story_ids_to_extract:
        log.debug(f"Adding story {stories_id} for download {downloads_id} to extraction queue...")
        JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    log.info(f"Handled download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} that was just handled: {download_url})")
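
# A quick sanity check of the content-type filter used above. This is an
# illustrative sketch, not part of the module; the sample content types are
# hypothetical, but the regex is the one from store_response():
def __example_supported_content_types() -> None:
    import re

    supported_content_types_regex = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)

    # These match, so the response body would be stored as-is:
    assert supported_content_types_regex.search('text/html; charset=UTF-8')
    assert supported_content_types_regex.search('application/rss+xml')

    # These don't, so the body would be replaced with '(unsupported content type)':
    assert not supported_content_types_regex.search('image/png')
    assert not supported_content_types_regex.search('application/pdf')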
def __response_is_gzipped_data(response: Response) -> bool:
    """Return True if Response looks like it's gzipped."""
    url_path = str(furl(response.request().url()).path)
    content_type = response.content_type() or ''

    if url_path.lower().endswith('.gz') or 'gzip' in content_type.lower():
        return True
    else:
        return False
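
# The two gzip heuristics above, shown in isolation on hypothetical inputs
# (illustrative sketch only; furl is the same library the helper uses):
def __example_gzip_heuristics() -> None:
    from furl import furl

    # Signal 1: the URL path ends in ".gz" (the query string is not part of .path)
    assert str(furl('https://example.com/feed.xml.gz?page=1').path).lower().endswith('.gz')

    # Signal 2: the Content-Type header mentions gzip
    assert 'gzip' in 'application/x-gzip'.lower()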
def _store_failed_download_error_message(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    """
    Deal with any errors returned by the fetcher response.

    If the error status looks like something that the site could recover from (503, 500 read timeout), schedule a
    retry using backoff timing. If we don't recognize the status as something we can recover from, or if we have
    exceeded the maximum number of retries, set the download's 'state' to 'error' and set its 'error_message' to
    describe the error.
    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if response.is_success():
        # Hard error because only failed responses should reach this helper
        raise McCrawlerFetcherHardError("Download was successful, so nothing to handle")

    # Increment the retry counter embedded in the previous error message (if any)
    error_num = 1
    error = download.get('error_message', None)
    if error:
        error_num_match = re.search(r'\[error_num: (\d+)\]$', error)
        if error_num_match:
            error_num = int(error_num_match.group(1)) + 1
        else:
            error_num = 1

    error_message = f"{response.status_line()}\n[error_num: {error_num}]"

    responded_with_timeout = re.search(r'(503|500 read timeout)', response.status_line(), flags=re.IGNORECASE)
    if responded_with_timeout and error_num < self._MAX_5XX_RETRIES:
        # Recoverable server error: retry later, backing off by one hour per attempt
        db.query("""
            UPDATE downloads
            SET state = 'pending',
                download_time = NOW() + %(download_interval)s::interval,
                error_message = %(error_message)s
            WHERE downloads_id = %(downloads_id)s
        """, {
            'download_interval': f"{error_num} hours",
            'error_message': error_message,
            'downloads_id': downloads_id,
        })
    else:
        # Unrecoverable error (or too many retries): mark the download as failed
        db.query("""
            UPDATE downloads
            SET state = 'error',
                error_message = %(error_message)s
            WHERE downloads_id = %(downloads_id)s
        """, {
            'error_message': error_message,
            'downloads_id': downloads_id,
        })
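
# A minimal standalone sketch of the retry-counter arithmetic above;
# __next_error_num is a hypothetical helper written for illustration only:
from typing import Optional


def __next_error_num(previous_error_message: Optional[str]) -> int:
    """Return the next [error_num: N] value, starting at 1 for a fresh error."""
    if previous_error_message:
        match = re.search(r'\[error_num: (\d+)\]$', previous_error_message)
        if match:
            return int(match.group(1)) + 1
    return 1


def __example_backoff_counter() -> None:
    assert __next_error_num(None) == 1
    assert __next_error_num('503 Service Unavailable\n[error_num: 2]') == 3
    # Each retry is scheduled "error_num hours" out, so attempts back off
    # linearly: +1h, +2h, +3h, ... up to _MAX_5XX_RETRIES.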
def __solr_error_message_from_response(response: Response) -> str:
    """Parse out the Solr error message from a response."""
    if response.error_is_client_side():
        # UserAgent error (UserAgent wasn't able to connect to the server or something like that)
        error_message = f'UserAgent error: {response.decoded_content()}'

    else:
        status_code_str = str(response.code())

        if status_code_str.startswith('4'):
            # Client error: set the default message first
            error_message = f'Client error: {response.status_line()} {response.decoded_content()}'

            # Parse out the Solr error message if there is one
            solr_response_maybe_json = response.decoded_content()
            if solr_response_maybe_json:
                solr_response_json = {}

                try:
                    solr_response_json = response.decoded_json()
                except Exception as ex:
                    log.debug(f"Unable to parse Solr error response: {ex}; raw response: {solr_response_maybe_json}")

                solr_error_message = solr_response_json.get('error', {}).get('msg', None)
                request_params = solr_response_json.get('responseHeader', {}).get('params', {})

                if solr_error_message and request_params:
                    request_params_json = encode_json(request_params)

                    # We were able to decode the Solr error message, so overwrite the default message with it
                    error_message = f'Solr error: "{solr_error_message}", params: {request_params_json}'

        elif status_code_str.startswith('5'):
            # Server error
            error_message = f'Server error: {response.status_line()} {response.decoded_content()}'

        else:
            # Some other unexpected status
            error_message = f'Other error: {response.status_line()} {response.decoded_content()}'

    return error_message
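
# A representative Solr 4xx error body and what the parsing above extracts
# from it. The JSON shape follows Solr's usual error responses; this
# particular body is hypothetical (illustrative sketch only):
def __example_solr_error_parsing() -> None:
    solr_response_json = {
        'responseHeader': {'status': 400, 'params': {'q': 'media_id:1', 'rows': '10'}},
        'error': {'code': 400, 'msg': 'undefined field media_id'},
    }

    solr_error_message = solr_response_json.get('error', {}).get('msg', None)
    request_params = solr_response_json.get('responseHeader', {}).get('params', {})

    assert solr_error_message == 'undefined field media_id'
    assert request_params == {'q': 'media_id:1', 'rows': '10'}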
def ungzipped_response_content(response: Response) -> str:
    """Return the HTTP response's decoded content, gunzipping it if necessary."""
    if __response_is_gzipped_data(response):
        gzipped_data = response.raw_data()
        try:
            data = gunzip(gzipped_data).decode('utf-8', errors='replace')
        except McGunzipException as ex:
            log.error(f"Unable to gunzip response {response}: {ex}")
            data = response.decoded_content()
    else:
        data = response.decoded_content()

    assert isinstance(data, str)

    return data
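
# Round-trip check of the gunzip path using only the standard library for
# illustration (the module's own gunzip() / McGunzipException come from its
# compression helpers, not from stdlib gzip):
def __example_gunzip_roundtrip() -> None:
    import gzip

    original = '<rss version="2.0"></rss>'
    gzipped_data = gzip.compress(original.encode('utf-8'))

    assert gzip.decompress(gzipped_data).decode('utf-8', errors='replace') == original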