示例#1
0
def isMultiPaged(url):
    """Report whether the listing behind ``url`` spans more than one page.

    Pages 1 and 2 are downloaded, the text of the elements selected by a
    fixed XPath is extracted from each, and the two results are compared.

    Args:
        url: %-style template with a single placeholder that receives the
            page number (``url % 1``, ``url % 2``).

    Returns:
        True when both pages download and their extracted text differs,
        False when either download yields None or the text is identical.
    """
    content_xpath = "/html/body/div[1]/div[1]/div[1]/div[2]/*"

    def extract_text(html):
        # Return the comma-joined leading text of every selected element.
        nodes = etree.HTML(html).xpath(content_xpath)
        # An element without leading text has ``text`` set to None, which
        # ``str.join`` rejects; treat such elements as empty strings.
        return ",".join(node.text or "" for node in nodes)

    html_page1 = common.download(url % 1)
    if html_page1 is None:
        # Same treatment as a missing second page: nothing to compare.
        return False
    html_page2 = common.download(url % 2)
    if html_page2 is None:
        return False
    return extract_text(html_page1) != extract_text(html_page2)
示例#2
0
文件: spiderbs4.py 项目: Vwan/Python
def isMultiPaged(url):
    """Report whether the listing behind ``url`` spans more than one page.

    Pages 1 and 2 are downloaded and their ``<body>`` elements are compared
    as strings after the first ``<script>`` tag has been removed from each.

    Args:
        url: %-style template with a single placeholder that receives the
            page number (``url % 1``, ``url % 2``).

    Returns:
        True when both pages download and their bodies differ, False when
        either download yields None or the bodies are identical.
    """
    def body_without_first_script(html):
        # Return the page body as a string with its first <script> removed.
        body = BeautifulSoup(html, 'html.parser').find('body')
        if body is None:
            # No <body> at all: compare as empty instead of crashing.
            return ""
        script = body.script
        if script is not None:
            # Only strip a script when one exists; ``body.script`` is None
            # for script-free pages.
            script.decompose()
        return str(body)

    html_page1 = common.download(url % 1)
    if html_page1 is None:
        # Same treatment as a missing second page: nothing to compare.
        return False
    html_page2 = common.download(url % 2)
    if html_page2 is None:
        return False
    return (body_without_first_script(html_page1)
            != body_without_first_script(html_page2))
示例#3
0
文件: spiderbs4.py 项目: Vwan/Python
def crawled_page(crawled_url):
    """Download a page and pull out its title and main text.

    Args:
        crawled_url: address handed to ``common.download``.

    Returns:
        A 2-tuple:
          * ``("Title_Is_None", crawled_url)`` when no ``<h1 class="title">``
            is found;
          * ``(title text, "Content_Is_None")`` when no
            ``<div class="show-content">`` is found;
          * ``(title text, content text)`` otherwise.
    """
    page = BeautifulSoup(common.download(crawled_url), 'html.parser')

    heading = page.find('h1', {'class': 'title'})
    if heading is None:
        return "Title_Is_None", crawled_url

    body = page.find('div', {'class': 'show-content'})
    body_text = "Content_Is_None" if body is None else body.text
    return heading.text, body_text
示例#4
0
文件: spiderbs4.py 项目: Vwan/Python
def getNumberOfPages(url):
    """Count how many consecutive pages exist for a paginated URL.

    Args:
        url: %-style template with a single placeholder that receives the
            page number.

    Returns:
        1 when ``isMultiPaged(url)`` is false; otherwise the number of the
        last page that downloads successfully, probing upward from page 2.
    """
    if not isMultiPaged(url):
        return 1

    # Keep the template intact: formatting it in place would leave no
    # placeholder for the next page number.
    pages = 1
    # NOTE(review): termination relies on common.download returning None
    # for a page past the end -- confirm this holds for the target site.
    while common.download(url % (pages + 1)) is not None:
        pages += 1
    return pages
示例#5
0
def crawled_page(crawled_url):
    """Download a page and pull out its title and main text via XPath.

    Args:
        crawled_url: address handed to ``common.download``.

    Returns:
        A 2-tuple:
          * ``("Title_Is_None", crawled_url)`` when the title XPath matches
            nothing;
          * ``(title text, "Content_Is_None")`` when the content XPath
            matches nothing;
          * ``(title text, content text)`` otherwise, where the content is
            the concatenated string value of every matched element that has
            leading text.
    """
    html = common.download(crawled_url)
    tree = etree.HTML(html)

    title = tree.xpath("/html/body/div[1]/div[1]/div[1]/h1")
    if not title:
        return "Title_Is_None", crawled_url
    # xpath() returns a list of elements; the text lives on the first match,
    # not on the list itself.
    title_text = title[0].text

    contents = tree.xpath("/html/body/div[1]/div[1]/div[1]/div[2]/*")
    if not contents:
        return title_text, "Content_Is_None"

    # Elements whose leading text is None are skipped, as before.
    content = "".join(
        x.xpath('string()') for x in contents if x.text is not None
    )
    return title_text, content
示例#6
0
文件: spiderbs4.py 项目: Vwan/Python
def a_links(url_seed, attrs=None):
    """Return every ``<a>`` tag on a page that matches ``attrs``.

    Args:
        url_seed: address handed to ``common.download``.
        attrs: optional attribute filter forwarded to
            ``BeautifulSoup.find_all``; omitted or None means no filtering.

    Returns:
        The result of ``soup.find_all('a', attrs)``.
    """
    # Build a fresh dict per call rather than sharing one mutable default
    # object across calls and with third-party code.
    if attrs is None:
        attrs = {}
    html = common.download(url_seed)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('a', attrs)
示例#7
0
def a_links(url_seed, attrs={}):
    """Return every ``<a>`` element found on a page.

    Args:
        url_seed: address handed to ``common.download``.
        attrs: accepted but not used by this implementation; no filtering
            is applied.

    Returns:
        The list produced by the ``//a`` XPath query on the parsed page.
    """
    document = etree.HTML(common.download(url_seed))
    return document.xpath("//a")