def get_all_external_links(site_url):
    bsObj = get_soup(site_url)
    internal_links = get_internal_links(bsObj, split_address(site_url)[0])
    external_links = get_external_links(bsObj, split_address(site_url)[0])
    for link in external_links:
        if link not in all_ext_links:
            all_ext_links.add(link)
            print(link)
    for link in internal_links:
        if link not in all_int_links:
            print("The URL of the link to be fetched is:", link)
            all_int_links.add(link)
            get_all_external_links(link)
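# The two link sets and the split_address helper used above come from
# elsewhere in this project and are not shown here. A minimal sketch of
# plausible definitions (these exact bodies are assumptions, not the
# original code):
all_ext_links = set()
all_int_links = set()

def split_address(address):
    # Drop the scheme and split the remainder on "/", so that index 0
    # is the bare domain, e.g. "oreilly.com".
    return address.replace("https://", "").replace("http://", "").split("/")

# Example call; the starting site is arbitrary:
# get_all_external_links("http://oreilly.com")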
def get_random_externallink(startingPage):
    bsObj = get_soup(startingPage)
    externalLinks = get_external_links(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme + "://" + urlparse(
            startingPage).netloc
        internalLinks = get_internal_links(bsObj, domain)
        if len(internalLinks) == 0:
            print("No internal links! End script")
            return
        return get_random_externallink(
            internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
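# A sketch of how get_random_externallink is typically driven: hop from
# one random external link to the next. follow_external_only is not part
# of the original file, and recursion depth is unbounded here.
def follow_external_only(starting_site):
    external_link = get_random_externallink(starting_site)
    if external_link is None:
        return  # dead end: no external or internal links were found
    print("Random external link is:", external_link)
    follow_external_only(external_link)

# follow_external_only("http://oreilly.com")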
def get_links(pageURL):
    global pages
    req = get_requests("http://en.wikipedia.org" + pageURL)
    bsObj = get_soup(req=req)
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").find("p"))
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing some attributes! No need to worry, though.")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                # We have encountered a new page
                newPage = link.attrs["href"]
                print("--------------------------\n" + newPage)
                pages.add(newPage)
                get_links(newPage)
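# get_links expects a module-level set named pages. A sketch of the usual
# setup and starting call, beginning at the Wikipedia front page (empty
# path); the crawl recurses until it runs out of unseen /wiki/ links:
pages = set()
# get_links("")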
def get_links(articleURL):
    req = get_requests("http://en.wikipedia.org" + articleURL)
    bsObj = get_soup(req=req, parse="html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
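# A sketch of the random-walk loop this variant of get_links is usually
# paired with; the starting article is an arbitrary assumption, and the
# loop runs until it reaches a page with no qualifying article links.
import random

links = get_links("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Pick one in-article link at random and descend into it.
    newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(newArticle)
    links = get_links(newArticle)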
import requests
from bs4 import BeautifulSoup

from common.my_urllib import get_soup

"""
Request headers sent by a real browser:
Host             https://www.google.com/
Connection       keep-alive
Accept           text/html, application/xhtml+xml, application/xml;q=0.9, image/webp, */*;q=0.8
User-Agent       Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36
Referrer         https://www.google.com/
Accept-Encoding  gzip, deflate, sdch
Accept-Language  en-US,en;q=0.8
"""

"""
Python's default request headers:
Accept-Encoding  identity
User-Agent       Python-urllib/3.6
"""

session = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Accept": "text/html, application/xhtml+xml, application/xml;q=0.9, image/webp, */*;q=0.8"
}
url = "https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending"
req = session.get(url, headers=headers)
bsObj = BeautifulSoup(req.text, "lxml")
# bsObj = get_soup(url)  # would refetch without the custom headers and clobber bsObj
print(bsObj.find("table", {"class": "table-striped"}).get_text())
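# To verify which headers were actually sent, requests keeps the prepared
# request on the response object, so the User-Agent override can be checked
# directly:
print(req.request.headers["User-Agent"])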
# -*- coding: utf-8 -*-
# @Time    : 31/07/2018 12:29
# @Author  : WANG Wenxiao
# @FileName: ReWithBS2.4.py

from common.my_urllib import get_soup
import re

bsObj = get_soup("http://www.pythonscraping.com/pages/page3.html")
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image, end="\t")
    print(image["src"])

"""
2.5 Getting attributes
So far we have covered how to find and filter tags and how to read the content
inside a tag. When scraping, however, you often do not need a tag's content but
one of its attributes: for example, the URL an <a> tag points to is stored in
its href attribute, and the image file of an <img> tag is stored in its src
attribute, so being able to read tag attributes is very useful.

For a tag object, all of its attributes can be retrieved with:
myTag.attrs
Note that this returns a Python dictionary, so the attributes can be read and
manipulated directly. For example, to get an image's source location src
(a short example follows after the section heading below):
myImgTag.attrs["src"]
"""

"""
2.6 Lambda expressions
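# (Example for section 2.5 above.) .attrs is a plain dict, so attributes
# can be read like any dictionary entry; this reuses the first <img>
# matched earlier and assumes the page yielded at least one match:
first_img = images[0]
print(first_img.attrs)           # the full attribute dictionary
print(first_img.attrs["src"])    # equivalent to first_img["src"]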
def download_all_img(url):
    imgpath = '/Users/shitong/Pictures/spyder/'
    bsObj = get_soup(url)
    title = bsObj.title.get_text().strip()
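# download_all_img is cut off above. A guess at how it might continue,
# assuming the goal is to save every <img> on the page into a directory
# named after the page title (all names below are hypothetical):
import os
from urllib.parse import urljoin
from urllib.request import urlretrieve

def download_all_img_sketch(url):
    imgpath = '/Users/shitong/Pictures/spyder/'
    bsObj = get_soup(url)
    title = bsObj.title.get_text().strip()
    target_dir = os.path.join(imgpath, title)
    os.makedirs(target_dir, exist_ok=True)
    for i, img in enumerate(bsObj.findAll("img", src=True)):
        img_url = urljoin(url, img["src"])  # resolve relative src values
        urlretrieve(img_url, os.path.join(target_dir, "img_%d.jpg" % i))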