예제 #1
0
    def store_response(self, db: DatabaseHandler, download: dict,
                       response: Response) -> None:
        """
        Persist the outcome of fetching a download and enqueue follow-up extraction jobs.

        An unsuccessful response is delegated to _store_failed_download_error_message() and nothing else is done.
        For a successful response, the body is kept only if the content type looks textual; otherwise a
        placeholder string is stored instead. The download row's URL is synchronised with the URL in the
        "download" dict, the content is passed to self.store_download(), and every story ID it returns is added
        to the 'MediaWords::Job::ExtractAndVector' queue.

        :param db: Database handler used for the UPDATE query and passed on to the storing helpers.
        :param download: Download row as a dict; must contain 'downloads_id' and 'url'.
        :param response: Fetcher response for the download.
        """
        download = decode_object_from_bytes_if_needed(download)
        downloads_id, download_url = download['downloads_id'], download['url']

        log.info(f"Handling download {downloads_id}...")
        log.debug(
            f"(URL of download {downloads_id} which is about to be handled: {download_url})"
        )

        if not response.is_success():
            log.info(
                f"Download {downloads_id} errored: {response.decoded_content()}"
            )
            self._store_failed_download_error_message(db=db,
                                                      download=download,
                                                      response=response)
            return

        # Only textual payloads are stored verbatim; anything else is replaced with a placeholder
        textual_content_type = re.compile(
            r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
        content = (response.decoded_content()
                   if textual_content_type.search(response.content_type() or '')
                   else '(unsupported content type)')

        # Touch the row only when the stored URL differs from the one on the download dict
        url_update_params = {
            'downloads_id': downloads_id,
            'download_url': download_url,
        }
        db.query(
            """
            UPDATE downloads
            SET url = %(download_url)s
            WHERE downloads_id = %(downloads_id)s
              AND url != %(download_url)s
        """, url_update_params)

        stories_to_extract = self.store_download(db=db,
                                                 download=download,
                                                 content=content)

        for stories_id in stories_to_extract:
            log.debug(
                f"Adding story {stories_id} for download {downloads_id} to extraction queue..."
            )
            extraction_broker = JobBroker(queue_name='MediaWords::Job::ExtractAndVector')
            extraction_broker.add_to_queue(stories_id=stories_id)

        log.info(f"Handled download {downloads_id}...")
        log.debug(
            f"(URL of download {downloads_id} that was just handled: {download_url})"
        )
예제 #2
0
def __response_is_gzipped_data(response: Response) -> bool:
    """
    Tell whether a response appears to carry gzipped data.

    The response counts as gzipped when the path of the request URL ends with ".gz" or when its content type
    mentions "gzip"; both comparisons are case-insensitive.
    """
    request_path = str(furl(response.request().url()).path).lower()
    declared_type = (response.content_type() or '').lower()

    return request_path.endswith('.gz') or 'gzip' in declared_type
예제 #3
0
    def _store_failed_download_error_message(self, db: DatabaseHandler, download: dict, response: Response) -> None:
        """
        Record a failed fetch on the download row, rescheduling it when the failure looks recoverable.

        The number of failed attempts is tracked as a trailing "[error_num: N]" marker in the download's
        'error_message'. If the response's status line matches "503" or "500 read timeout" (case-insensitively)
        and the attempt number is still below self._MAX_5XX_RETRIES, the download is put back into the 'pending'
        state with 'download_time' pushed "error_num" hours into the future. In every other case the download's
        'state' is set to 'error'. Either way 'error_message' is updated with the status line and the attempt
        number.

        :param db: Database handler used to update the "downloads" row.
        :param download: Download row as a dict; must contain 'downloads_id', may contain 'error_message'.
        :param response: Failed fetcher response.
        :raises McCrawlerFetcherHardError: if the response is actually successful.
        """
        download = decode_object_from_bytes_if_needed(download)

        downloads_id = download['downloads_id']

        if response.is_success():
            # Only failed responses are supposed to be routed here
            raise McCrawlerFetcherHardError("Download was successful, so nothing to handle")

        # Continue counting from the attempt number left by the previous failure, if there is one
        previous_error = download.get('error_message', None)
        attempt_match = re.search(r'\[error_num: (\d+)\]$', previous_error) if previous_error else None
        error_num = int(attempt_match.group(1)) + 1 if attempt_match else 1

        error_message = f"{response.status_line()}\n[error_num: {error_num}]"

        looks_recoverable = re.search(r'(503|500 read timeout)', response.status_line(), flags=re.IGNORECASE)
        give_up = not (looks_recoverable and error_num < self._MAX_5XX_RETRIES)

        if give_up:
            db.query("""
                UPDATE downloads
                SET
                    state = 'error',
                    error_message = %(error_message)s
                WHERE downloads_id = %(downloads_id)s
            """, {
                'error_message': error_message,
                'downloads_id': downloads_id,
            })

        else:
            # Linear back-off: the N-th failed attempt is retried N hours from now
            db.query("""
                UPDATE downloads
                SET
                    state = 'pending',
                    download_time = NOW() + %(download_interval)s::interval,
                    error_message = %(error_message)s
                WHERE downloads_id = %(downloads_id)s
            """, {
                'download_interval': f"{error_num} hours",
                'error_message': error_message,
                'downloads_id': downloads_id,
            })
예제 #4
0
def __solr_error_message_from_response(response: Response) -> str:
    """
    Build a human-readable error message from a failed Solr response.

    The message depends on the kind of failure:

    * client-side (UserAgent) failure: "UserAgent error: <content>";
    * 4xx status: 'Solr error: "<msg>", params: <params JSON>' if the body is a JSON object carrying both
      error.msg and responseHeader.params; otherwise the default "Client error: <status line> <content>";
    * 5xx status: "Server error: <status line> <content>";
    * anything else: "Other error error: <status line> <content>".

    The default 4xx message is kept whenever the Solr-specific details can't be extracted (empty body, body that
    is not valid JSON, JSON of an unexpected shape, or missing message / params), so the function always returns
    a string.

    :param response: Failed response from Solr.
    :return: Error message describing the failure.
    """

    if response.error_is_client_side():
        # UserAgent error (UserAgent wasn't able to connect to the server or something like that)
        return f'UserAgent error: {response.decoded_content()}'

    status_code_str = str(response.code())

    if status_code_str.startswith('5'):
        # Server error or some other error
        return f'Server error: {response.status_line()} {response.decoded_content()}'

    if not status_code_str.startswith('4'):
        # Some weird stuff
        return f'Other error error: {response.status_line()} {response.decoded_content()}'

    # Client error - default message, used unless Solr's own error details can be parsed out below
    error_message = f'Client error: {response.status_line()} {response.decoded_content()}'

    solr_response_maybe_json = response.decoded_content()
    if not solr_response_maybe_json:
        return error_message

    try:
        solr_response_json = response.decoded_json()
    except Exception as ex:
        # Best effort: an unparseable body just means that we stick with the default message
        log.debug(
            f"Unable to parse Solr error response: {ex}; raw response: {solr_response_maybe_json}"
        )
        return error_message

    if not isinstance(solr_response_json, dict):
        # Valid JSON, but not the object that a Solr error response is expected to be
        return error_message

    # Keep the parsed details in separate variables so that the default message is not clobbered
    solr_error = solr_response_json.get('error')
    solr_error_message = solr_error.get('msg') if isinstance(solr_error, dict) else None

    response_header = solr_response_json.get('responseHeader')
    request_params = response_header.get('params') if isinstance(response_header, dict) else None

    if solr_error_message and request_params:
        request_params_json = encode_json(request_params)

        # If we were able to decode Solr error message, use it instead of the default error message
        return f'Solr error: "{solr_error_message}", params: {request_params_json}'

    return error_message
예제 #5
0
def ungzipped_response_content(response: Response) -> str:
    """
    Return the HTTP response's content as text, gunzipping it first if necessary.

    If the response looks gzipped, its raw data is gunzipped and decoded as UTF-8 (undecodable bytes are
    replaced). When gunzipping fails with McGunzipException, the failure is logged and the response's regular
    decoded content is returned instead, same as for responses that don't look gzipped.
    """
    content = None

    if __response_is_gzipped_data(response):
        compressed = response.raw_data()
        try:
            content = gunzip(compressed).decode('utf-8', errors='replace')
        except McGunzipException as ex:
            log.error("Unable to gunzip response {}: {}".format(response, ex))

    if content is None:
        # Either the response was not gzipped, or gunzipping it has just failed
        content = response.decoded_content()

    assert isinstance(content, str)

    return content