示例#1
0
    def startrequests(self):
        """Build the list of comment-page URLs and dispatch it to the worker.

        Fetches ``self.url.format(self.pagenum)``, reads the
        ``comment.commenttotal`` value from the JSON response, and derives a
        page count by dividing that total by 25 (presumably the page size of
        the remote API -- confirm). One formatted URL per page index is
        appended to ``self.savestarturl``, and the whole list is then handed
        to ``get_content.delay``.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                does not complete within the timeout.
            ValueError: if the response body is not valid JSON.
        """
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, but confirm it is intentional.
        response = requests.get(self.url.format(self.pagenum),
                                headers=self.headers,
                                verify=False,
                                timeout=10)
        comments = json.loads(response.text)

        # Fall back to an empty dict (not a string) so a missing or null
        # 'comment' entry yields a total of 0 instead of an AttributeError.
        comment_info = comments.get('comment') or {}
        commenttotal = comment_info.get('commenttotal', 0)
        num = math.ceil(commenttotal / 25)

        for i in range(num):
            self.savestarturl.append(self.url.format(i))

        get_content.delay(self.savestarturl)
示例#2
0
    def startrequests(self):
        """Collect the header links of the start page and dispatch them.

        Downloads ``self.start_url`` and extracts the ``href`` and text of
        every anchor matched by ``//div[@class="wrap header"]/div/a``. One
        single-entry dict ``{title: self.start_url + href}`` per anchor is
        appended to ``self.savestarturl``; every entry except the first is
        then passed to ``get_content.delay``.
        """
        response = requests.get(self.start_url,
                                headers=self.headers,
                                verify=False,
                                timeout=10)
        tree = etree.HTML(response.text)

        # Both queries share the same anchor selection; only the leaf differs.
        anchor_xpath = '//div[@class="wrap header"]/div/a'
        hrefs = tree.xpath(anchor_xpath + '/@href')
        names = tree.xpath(anchor_xpath + '/text()')

        self.savestarturl.extend(
            {name: self.start_url + href} for name, href in zip(names, hrefs))

        # The first collected entry is deliberately left out of the dispatch.
        get_content.delay(self.savestarturl[1:])
示例#3
0
文件: scrapy.py 项目: hsy0601/proj
import time
import requests
from bs4 import BeautifulSoup
from proj.tasks import get_content

# Script: fetch the "What links here" listing for Wikidata item Q5, hand the
# linked page URLs to the get_content task, print the list results and the
# elapsed wall-clock time.
t1 = time.time()

# The title and the paging parameters must be separated by '&'; without it the
# server sees the title "Special:WhatLinksHere/Q5limit=500".
url = "http://www.wikidata.org/w/index.php?title=Special:WhatLinksHere/Q5&limit=500&from=0"
# Adjacent literals are concatenated, so the header value contains a single
# space instead of the indentation of a backslash-continued line.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML,like Gecko) '
                         'Chrome/67.0.3396.87 Safari/537.36'}
req = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(req.text, "lxml")

# A missing list container yields no entries rather than a TypeError.
container = soup.find(id='mw-whatlinkshere-list')
human_list = container.find_all('li') if container is not None else []

urls = []
for human in human_list:
    link = human.find('a')
    # Skip list items that carry no usable link.
    if link is None or not link.get('href'):
        continue
    urls.append('https://www.wikidata.org' + link['href'])

result = get_content.delay(urls)
res = list(result.collect())
for r in res:
    # Only print non-empty lists whose first element is a string; the
    # emptiness check avoids an IndexError on r[1][0].
    if isinstance(r[1], list) and r[1] and isinstance(r[1][0], str):
        print(r[1])

t2 = time.time()
print('耗时:%s' % (t2-t1))