Пример #1
0
def transformed_response_body(
        response: Response,
        html_transform: Callable[[BeautifulSoup, str, ProxyUrl], None],
        proxy_url: ProxyUrl) -> Tuple[bool, bytes]:
    """Rewrite an HTML or CSS response body; pass anything else through.

    Args:
        response: Upstream response; its ``body``, ``headers`` and ``url``
            attributes are read.
        html_transform: Callback that mutates the parsed document in place.
            It is invoked as
            ``html_transform(soup, base_url=..., proxy_url=...)``.
        proxy_url: Forwarded unchanged to ``html_transform`` and
            ``process_css``.

    Returns:
        A ``(is_html, body)`` pair. ``is_html`` is True only when the
        response was parsed and re-serialised as HTML (always UTF-8).
        CSS is returned re-encoded as UTF-8; every other content type is
        returned byte-for-byte.
    """
    body = response.body or b''
    content_type = (response.headers or {}).get('content-type') or ''
    # Media types are case-insensitive, so dispatch on a lowered copy. The
    # untouched header value is still what the charset is extracted from.
    media_type = content_type.lower()

    if media_type.startswith('text/html'):
        encoding = http_content_type_encoding(content_type)
        try:
            base_url = get_base_url(body, response.url, encoding)
        except UnicodeDecodeError:
            # Body is not decodable with the declared encoding: fall back to
            # the response's own URL as the base.
            base_url = response.url
        soup = BeautifulSoup(body, 'lxml', from_encoding=encoding)
        html_transform(
            soup, base_url=base_url, proxy_url=proxy_url)
        head = soup.find('head')
        if head:
            # The document is serialised as UTF-8 below, so declare that.
            head.append(soup.new_tag('meta', charset='utf8'))
        return True, soup.encode('utf8')

    if media_type.startswith('text/css'):
        # Undecodable bytes are dropped rather than failing the stylesheet.
        css_source = body.decode('utf8', 'ignore')
        rewritten = process_css(
            css_source, base_uri=response.url, proxy_url=proxy_url)
        return False, rewritten.encode('utf8')

    return False, body
Пример #2
0
 def _headers_encoding(self):
     """
     从headers获取头部charset编码
     """
     content_type = self.headers.get("Content-Type") or self.headers.get(
         "content-type")
     return (http_content_type_encoding(content_type) or "utf-8"
             if "application/json" in content_type else None)
Пример #3
0
    def process_request_unsafe(self, request, spider):
        """Load ``request.url`` in the spider's session and build a response.

        Args:
            request: Object whose ``url`` attribute is visited.
            spider: Object exposing a ``session`` with ``visit``, ``wait``,
                ``body``, ``headers`` and ``url`` methods.

        Returns:
            An ``HtmlResponse`` for the loaded page, or None when the session
            reports no body.
        """
        session = spider.session
        session.visit(request.url)
        session.wait()

        body = session.body()
        if body is None:
            # Nothing was loaded: skip header and encoding work entirely.
            return None

        raw_headers = session.headers()
        # Normalise header names to str once and reuse the result below.
        headers = {str(name): raw_headers[name] for name in raw_headers}

        encoding = http_content_type_encoding(headers.get("Content-Type"))
        if encoding is None:
            # Fallback: run the same helper over the body itself.
            # NOTE(review): presumably this picks up a ``charset=``
            # declaration inside the markup -- confirm.
            encoding = http_content_type_encoding(body)
        if encoding is None:
            encoding = 'utf-8'

        return HtmlResponse(
            session.url(), body=body, encoding=encoding, headers=headers)
Пример #4
0
    def __call__(self, session, url, *args, **kwargs):
        """Visit *url* in *session* and wrap the loaded page in an ``HtmlPage``.

        The session is stored on ``self.session``. The page encoding is
        whatever ``http_content_type_encoding`` returns for the
        ``Content-Type`` header. Extra positional and keyword arguments are
        accepted and ignored.
        """
        self.session = session
        session.visit(url)
        session.wait()

        page_body = session.body()
        raw_headers = session.headers()
        # Copy the session's header mapping into a plain dict.
        header_map = {name: raw_headers[name] for name in raw_headers}
        charset = http_content_type_encoding(header_map.get("Content-Type"))

        return HtmlPage(
            session.url(),
            headers=header_map,
            body=page_body,
            encoding=charset,
        )
Пример #5
0
def download_url(url: str, timeout: int=None) -> str:
    """Fetch *url* with an HTTP GET and return the response body as text.

    A falsy *timeout* falls back to the configured ``TIMEOUT``. The text is
    decoded with the charset from the ``Content-Type`` header or, when the
    header yields nothing, with the encoding declared in the body itself.
    If neither source names an encoding, the response's encoding is left
    as it was.
    """
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT

    response = requests.get(
        url,
        headers={'User-Agent': WGET_USER_AGENT},
        verify=CHECK_SSL_VALIDITY,
        timeout=timeout or TIMEOUT,
    )

    declared = http_content_type_encoding(
        response.headers.get('Content-Type', ''))
    if not declared:
        # The header gave no charset; look for one declared in the body.
        declared = html_body_declared_encoding(response.text)

    if declared is not None:
        response.encoding = declared

    return response.text
Пример #6
0
 def _headers_encoding(self):
     """Return ``http_content_type_encoding`` of the Content-Type header.

     The header is looked up under a bytes key (empty bytes when absent) and
     converted with ``to_native_str`` before being parsed.
     """
     raw_value = self.headers.get(b'Content-Type', b'')
     header_text = to_native_str(raw_value)
     return http_content_type_encoding(header_text)
Пример #7
0
 def _headers_encoding(cls, response):
     """
         Look up the encoding type from the response's Content-Type header.
     """
     header_value = response.headers.get('Content-Type')
     detected = http_content_type_encoding(header_value)
     return detected
Пример #8
0
 def _headers_encoding(self):
     """Parse the encoding out of this object's Content-Type header.

     The bytes header value (empty bytes when the header is absent) is
     converted with ``to_native_str`` and handed to
     ``http_content_type_encoding``, whose result is returned.
     """
     header_bytes = self.headers.get(b'Content-Type', b'')
     return http_content_type_encoding(
         to_native_str(header_bytes)
     )
Пример #9
0
 def _infer_encoding_from_content_type(self):
     content_type = self.headers.get("Content-Type")
     if content_type:
         return http_content_type_encoding(content_type)
     return None
Пример #10
0
 def _headers_encoding(self):
     """Return ``http_content_type_encoding`` of the Content-Type header.

     The header value is passed on as-is, including None when it is absent.
     """
     header_value = self.headers.get('Content-Type')
     encoding = http_content_type_encoding(header_value)
     return encoding
Пример #11
0
 def _headers_encoding(self):
     """Return ``http_content_type_encoding`` of the Content-Type header.

     The header is read under a bytes key (empty bytes when absent) and
     converted with ``to_unicode`` before being parsed.
     """
     raw_value = self.headers.get(b"Content-Type", b"")
     header_text = to_unicode(raw_value)
     return http_content_type_encoding(header_text)
Пример #12
0
 def _headers_encoding(cls, response):
     """
         Determine the encoding type from the response's Content-Type header.
     """
     return http_content_type_encoding(
         response.headers.get('Content-Type')
     )
Пример #13
0
 def _headers_encoding(self):
     """Parse the encoding from this object's Content-Type header.

     The raw header value (None when absent) goes straight to
     ``http_content_type_encoding`` and its result is returned.
     """
     return http_content_type_encoding(
         self.headers.get('Content-Type')
     )
Пример #14
0
 def test_http_encoding_header(self):
     """A declared charset is extracted and normalised; other text gives None."""
     with_charset = "Content-Type: text/html; charset=ISO-8859-4"
     without_charset = "something else"

     extracted = http_content_type_encoding(with_charset)
     self.assertEqual(extracted, "iso8859-4")

     missing = http_content_type_encoding(without_charset)
     self.assertEqual(None, missing)
Пример #15
0
 def test_http_encoding_header(self):
     """Check charset extraction from a header line and the no-charset case."""
     # A header carrying a charset yields the normalised encoding name.
     self.assertEqual(
         http_content_type_encoding(
             "Content-Type: text/html; charset=ISO-8859-4"),
         "iso8859-4",
     )
     # Text without any charset declaration yields None.
     self.assertEqual(
         None,
         http_content_type_encoding("something else"),
     )
Пример #16
0
def guess_coding(body):
    """Guess the encoding of *body* with chardet.

    The detected name is wrapped in a ``charset=...`` string and passed
    through ``http_content_type_encoding``, whose result is returned.
    """
    detected = chardet.detect(body)["encoding"]
    # When chardet reports no encoding this reads "charset=None"; the helper
    # receives that literal text.
    header_like = f'charset={detected}'
    return http_content_type_encoding(header_like)