Example #1
0
 def test_copy_preserves_encoding(self):
     """Copying a soup keeps its detected original_encoding intact."""
     original = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
     expected_encoding = original.original_encoding
     cloned = original.__copy__()
     # The &nbsp; entity is rendered back out, and the copy serializes
     # to the same markup as the source document.
     assert str(cloned) == "<p> </p>"
     assert cloned.original_encoding == expected_encoding
Example #2
0
url = 'https://zh.moegirl.org/%E7%99%BD%E5%AD%A6'
# Percent-encode any non-ASCII characters (e.g. Chinese) remaining in the
# URL; marking all printable ASCII as "safe" leaves the already-encoded
# parts untouched.
url = quote(
    url,
    safe=string.printable)
# Create a connection pool that verifies TLS certificates against the
# certifi CA bundle.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
# Fetch the page. This blocks for the duration of the request, so consider
# running it inside a worker thread. HTTP method names are case-sensitive
# (RFC 7231), so use 'GET' rather than 'get'.
response = http.request(
    'GET', url
)
if response.status == 200:  # 200 == request succeeded
    print('Oh,yeah!')
# Build a BeautifulSoup DOM from the response body.
bsObj = BeautifulSoup(response.data, "html.parser")
# Independent copy of the parse tree, kept for later use.
d = bsObj.__copy__()
parseURL = parse.urlparse(url)
# An "internal" link either starts with "/" or contains this page's
# absolute URL (scheme + host + path).
currentURL = parseURL.scheme + '://' + parseURL.netloc + parseURL.path
# re.escape prevents regex metacharacters in the URL (e.g. the "." in the
# hostname) from being interpreted as pattern syntax.
links = bsObj.findAll("a", href=re.compile("^(/|.*" + re.escape(currentURL) + ")"))
# Collect unique internal links, turning site-relative hrefs into
# absolute URLs.
internalLinks = []
for link in links:
    href = link.attrs['href']  # hoist the repeated dict lookup
    if href is not None and href not in internalLinks:
        if href.startswith("/"):
            internalLinks.append(currentURL + href)
        else:
            internalLinks.append(href)
print('len(internalLinks):' + str(len(internalLinks)))
# save only the content