示例#1
0
def grabber(url,
            header=None,
            *,
            referer=None,
            cookie=None,
            raise_429=True,
            params=None,
            done=None):
    """Request url through a per-host session and return the response.

    One ``requests.Session`` is kept in ``sessions`` for each network
    location (the netloc part of ``url``) and reused by later calls, so
    headers and cookies applied here stay on that session.

    Args:
        url: Address to request.
        header: Optional mapping merged into the session's headers.
        referer: Optional referer; stored on the session after being
            passed through ``quote_unicode``.
        cookie: Optional dict of cookies added to the session's cookie
            jar. The caller's dict is not modified.
        raise_429: Passed to ``do_request`` via ``sync``.
        params: Passed to ``do_request`` via ``sync``.
        done: Optional callback invoked as ``done(session, response)``
            after the request has finished.

    Returns:
        The response object produced by ``sync(do_request, ...)`` -- not
        its decoded text or bytes. Callers read ``.content``,
        ``.headers`` or ``.text`` from it.
    """
    # Only the host part matters: sessions are keyed by network location.
    netloc = urlsplit(url).netloc

    try:
        session = sessions[netloc]
    except KeyError:
        # First request to this host: start a session with the defaults.
        session = requests.Session()
        session.headers.update(default_header)
        sessions[netloc] = session

    if header:
        session.headers.update(header)

    if referer:
        session.headers['referer'] = quote_unicode(referer)

    if cookie:
        # quote_unicode_dict's return value is unused, so it is taken to
        # rewrite its argument in place; quote a copy so the caller's dict
        # keeps its original values.
        quoted_cookie = dict(cookie)
        quote_unicode_dict(quoted_cookie)
        requests.utils.add_dict_to_cookiejar(session.cookies, quoted_cookie)

    response = sync(do_request, session, url, params, raise_429)

    if done:
        done(session, response)

    return response
示例#2
0
def grabimg(*args, **kwargs):
    """Download an image and return its extension together with its bytes.

    All positional and keyword arguments are forwarded to ``grabber``.

    Returns:
        A ``(ext, data)`` tuple. ``data`` is the response body as bytes and
        ``ext`` is whatever ``get_ext`` derives from the MIME type (``None``
        when the response has no Content-Type header) and the body.
    """
    r = sync(grabber, *args, **kwargs)

    data = r.content

    # The MIME type is the part of Content-Type before any ";" parameter,
    # e.g. "image/png; charset=binary" -> "image/png".
    mime = None
    if "Content-Type" in r.headers:
        mime = r.headers["Content-Type"].partition(";")[0].strip()
    return get_ext(mime, data), data
示例#3
0
def grabhtml(*args, **kwargs):
    """Fetch a page and return its source decoded to a string.

    All positional and keyword arguments are forwarded to ``grabber``.

    The raw body is searched for the first ``charset=...`` declaration.
    When one is found it overrides the response's encoding before the text
    is decoded; otherwise the response's own encoding is used.

    Returns:
        The decoded page source (``r.text``).
    """
    r = sync(grabber, *args, **kwargs)

    # Look for a charset declaration inside the document itself.
    match = re.search(br"charset=[\"']?([^\"'>]+)", r.content)
    if match:
        # latin-1 maps every byte to a code point, so this cannot fail.
        encoding = match.group(1).decode("latin-1").strip()
        # Decode gb2312 pages as gbk. Compare case-insensitively because
        # pages commonly declare the charset as "GB2312".
        if encoding.lower() == "gb2312":
            encoding = "gbk"
        r.encoding = encoding

    return r.text