import random
import re
from urllib.parse import urlparse

# Assumption: get_requests lives in the same helper module as get_soup;
# get_internal_links, get_external_links and split_address, along with the
# module-level sets all_int_links and all_ext_links, are assumed to be
# defined elsewhere in the original project.
from common.my_urllib import get_soup, get_requests


def get_all_external_links(site_url):
    # Recursively collect every internal and external link reachable from
    # site_url into the module-level sets.
    bsObj = get_soup(site_url)
    internal_links = get_internal_links(bsObj, split_address(site_url)[0])
    external_links = get_external_links(bsObj, split_address(site_url)[0])
    for link in external_links:
        if link not in all_ext_links:
            all_ext_links.add(link)
            print(link)
    for link in internal_links:
        if link not in all_int_links:
            print("About to fetch the link with URL:", link)
            all_int_links.add(link)
            get_all_external_links(link)
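
# Minimal usage sketch (an assumption, not part of the original snippet):
# the recursion above expects both sets to exist before the first call.
#
#     all_int_links = set()
#     all_ext_links = set()
#     get_all_external_links("http://oreilly.com")
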
def get_random_externallink(startingPage):
    # Return a random external link found on startingPage; if the page has
    # none, hop to a random internal page and keep looking.
    bsObj = get_soup(startingPage)
    externalLinks = get_external_links(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = (urlparse(startingPage).scheme + "://" +
                  urlparse(startingPage).netloc)
        internalLinks = get_internal_links(bsObj, domain)
        if len(internalLinks) == 0:
            print("No internal links! Ending script")
            return None
        return get_random_externallink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)
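
# Minimal random-walk sketch (an assumption, not part of the original
# snippet): keep following random external links until none is found.
#
#     link = get_random_externallink("http://oreilly.com")
#     while link is not None:
#         print("Random external link is:", link)
#         link = get_random_externallink(link)
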
def get_links(pageURL):
    global pages
    req = get_requests("http://en.wikipedia.org" + pageURL)
    bsObj = get_soup(req=req)
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").find("p"))
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing some attributes! No need to worry, though.")

    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                # We have encountered a new page
                newPage = link.attrs["href"]
                print("--------------------------\n" + newPage)
                pages.add(newPage)
                get_links(newPage)
# An alternative version that returns the article links instead of printing
# and recursing (note: this redefinition shadows get_links above).
def get_links(articleURL):
    req = get_requests("http://en.wikipedia.org" + articleURL)
    bsObj = get_soup(req=req, parse="html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
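
# Minimal usage sketch for the returning variant (an assumption, not part of
# the original snippet): a random walk across Wikipedia article links.
#
#     links = get_links("/wiki/Kevin_Bacon")
#     while len(links) > 0:
#         newArticle = random.choice(links).attrs["href"]
#         print(newArticle)
#         links = get_links(newArticle)
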
import requests
from bs4 import BeautifulSoup

from common.my_urllib import get_soup
"""
    浏览器网络请求数据
    Host    https://www.google.com/
    Connection  keep-alive
    Accept  text/html, application/xhtml+xml, application/xml;q=0.9, image/webp, */*;q=0.8
    User-Agent  Mozilla/5.0 (Macintosh; Intel Mac Os X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36
    Referrer    https://www.google.com/
    Accept-Encoding     gzip, deflate, sdch
    Accept-Language     en-US,en;q=0.8
"""
"""
    Python请求头:
    Accept-Encoding     identify
    User-Agent      Python-urllib/3.6
"""

session = requests.Session()
headers = {
    "User-Agent":
    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Accept":
    "text/html, application/xhtml+xml, application/xml;q=0.9, image/webp, */*;q=0.8"
}

url = "https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending"

req = session.get(url, headers=headers)
bsObj = BeautifulSoup(req.text, "lxml")
# The page echoes back the headers it received, so the table should show the
# spoofed mobile User-Agent above rather than Python's default.
print(bsObj.find("table", {"class": "table-striped"}).get_text())
# -*- coding: utf-8 -*-
#
# @Time    : 31/07/2018 12:29
#
# @Author  : WANG Wenxiao
#
# @FileName: ReWithBS2.4.py
#

from common.my_urllib import get_soup
import re

bsObj = get_soup("http://www.pythonscraping.com/pages/page3.html")

images = bsObj.findAll("img",
                       {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image, end="\t")
    print(image["src"])
"""
    2.5 获取属性
    到目前为止,我们已经介绍过如何获取和过滤标签,以及获取标签里的内容。
    但是,在网 络数据采集时你经常不需要查找标签的内容,而是需要查找标签属性。
    比如标签 <a> 指向 的 URL 链接包含在 href 属性中,或者 <img> 标签的图片文件包含在 src 属性中,这时获 取标签属性就变得非常有用了。
    对于一个标签对象,可以用下面的代码获取它的全部属性:
        myTag.attrs
    要注意这行代码返回的是一个 Python 字典对象,可以获取和操作这些属性。比如要获取图 片的资源位置 src,可以用下面这行代码:
        myImgTag.attrs["src"]
"""
"""
    2.6 Lambda 表达式
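
# A minimal sketch of what a lambda filter looks like in BeautifulSoup
# (assumption: illustrative only, not from the original text): find/findAll
# also accept a callable, so a lambda can select tags by arbitrary criteria.
# For example, every tag carrying exactly two attributes:
#
#     tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
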
def download_all_img(url):
    imgpath = '/Users/shitong/Pictures/spyder/'
    bsObj = get_soup(url)
    title = bsObj.title.get_text().strip()