def startrequests(self): """ 获取起始页分类的链接和名称 """ json_comment = requests.get(self.url.format(self.pagenum), headers=self.headers, verify=False).text comments = json.loads(json_comment) commenttotal = comments.get('comment', 'NULL').get('commenttotal', 0) num = math.ceil(commenttotal / 25) # num = 10 for i in range(num): self.savestarturl.append(self.url.format(i)) get_content.delay(self.savestarturl)
def startrequests(self):
    """Collect the category links and titles from the start page."""
    page = requests.get(self.start_url, headers=self.headers,
                        verify=False, timeout=10).text
    page_content = etree.HTML(page)
    # category links and their display names sit in the header navigation bar
    classifyurls = page_content.xpath('//div[@class="wrap header"]/div/a/@href')
    titles = page_content.xpath('//div[@class="wrap header"]/div/a/text()')
    for title, url in zip(titles, classifyurls):
        item = {title: self.start_url + url}
        self.savestarturl.append(item)
    # drop the first entry and hand the rest to the Celery task
    get_content.delay(self.savestarturl[1:])
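Both start methods hand their URL list to get_content.delay(...), but the task itself is not shown in this section. The following is a minimal sketch of what proj/tasks.py could look like: the Redis broker/backend addresses, the timeout, and the "return a list of page titles" behaviour are assumptions chosen to match the result check in the driver script below, not the original implementation (which would also need to handle the {title: url} dicts built above).

# proj/tasks.py -- minimal sketch; broker/backend URLs and extraction logic are assumed
import requests
from celery import Celery
from lxml import etree

app = Celery('proj',
             broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/1')

@app.task
def get_content(urls):
    """Download every URL in the list and return the page titles as a list of strings."""
    titles = []
    for url in urls:
        try:
            page = requests.get(url, timeout=10).text
            tree = etree.HTML(page)
            title = tree.xpath('//title/text()')
            if title:
                titles.append(title[0].strip())
        except requests.RequestException:
            # skip URLs that fail instead of failing the whole task
            continue
    return titles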
import time

import requests
from bs4 import BeautifulSoup

from proj.tasks import get_content

t1 = time.time()

# "What links here" for Q5 (human), 500 entries per page
url = "http://www.wikidata.org/w/index.php?title=Special:WhatLinksHere/Q5&limit=500&from=0"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, "lxml")
# every <li> in the list is one linking page; turn its relative href into a full URL
human_list = soup.find(id='mw-whatlinkshere-list')('li')
urls = []
for human in human_list:
    href = human.find('a')['href']
    urls.append('https://www.wikidata.org' + href)

# dispatch the whole URL list to the Celery worker and collect the results
result = get_content.delay(urls)
res = [v for v in result.collect()]
for r in res:
    if isinstance(r[1], list) and isinstance(r[1][0], str):
        print(r[1])

t2 = time.time()
print('Elapsed: %s' % (t2 - t1))
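A note on the driver script: get_content.delay(urls) returns as soon as the task is queued, so a Celery worker must already be running against the same broker (for an app defined in proj/tasks.py, something like celery -A proj.tasks worker -l info, though the exact command depends on how the project is configured). result.collect() then waits for the results and yields (AsyncResult, value) tuples, which is why the loop reads the task's return value from r[1] and only prints it when it looks like a list of strings.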