def crawl_detail_with_url(url, **kwargs):
    print('Paper detail URL: ' + url)
    response = send_request(url)
    # Only hand the page off for parsing when the request succeeded
    # and was not redirected away from the requested URL
    if response and url == response.url:
        print('Paper detail request succeeded: ' + response.url)
        title = kwargs['title']
        print('Detail request status code:', title, kwargs, response.status_code)
        app.send_task('tasks.parse_detail_data', args=(response.text, kwargs))
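send_request is not defined in this snippet. A minimal sketch of what it might look like, assuming it is a thin wrapper around requests that returns None on failure so the caller's truthiness check works:

import requests

def send_request(url, timeout=10):
    # Hypothetical helper: fetch the URL and return None on any
    # network error or non-200 status, so callers can test `if response:`
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response
    except requests.RequestException as e:
        print('Request failed:', url, e)
    return None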
def manage_crawl_task():
    # Polling loop: drain the Redis list "shop" (`r` is the Redis connection)
    # and dispatch each JSON-encoded record as a crawl task
    while True:
        shop = r.lpop("shop")
        if shop:
            shop = json.loads(shop.decode())
            content = app.send_task('tasks.crawl', args=(shop, ))
            # content = app.send_task('tasks.crawl', kwargs=shop)
            print(content)
        time.sleep(0.001)
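For completeness, the producer side of this queue is not shown. A minimal sketch, assuming the same Redis instance and hypothetical seed records, of a script that pushes JSON-encoded entries onto the "shop" list:

import json
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

# Hypothetical seed data; in the real project these records
# presumably come from a database or an earlier crawl stage
shops = [
    {'id': 1, 'url': 'http://example.com/shop/1'},
    {'id': 2, 'url': 'http://example.com/shop/2'},
]
for shop in shops:
    # rpush pairs with the consumer's lpop to form a FIFO queue
    r.rpush('shop', json.dumps(shop))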
def manage_crawl_task(urls):
    for url in urls:
        app.send_task('tasks.crawl', args=(url, ))
def parse_page_data(response):
    """
    From the search-result list page of each category keyword (politics, law, ...)
    on the old site, extract the URL of every paper's detail page.
    :param response:
    :return:
    """
    print('Page request succeeded: ' + response.url)
    # Pull the search keyword, document-type tag and page number out of the URL
    pattern = re.compile(r'.*?q=(.*?)\+.*?WF_(.*?)&.*?p=(\d+)', re.S)
    result = re.findall(pattern, response.url)[0]
    print(result)
    keyword = parse.unquote(result[0])
    tag = result[1]
    currentPage = result[2]
    etree_html = etree.HTML(response.text)
    itemList = etree_html.xpath(
        '//div[@class="record-item-list"]/div[@class="record-item"]')
    print(tag + keyword + ' page ' + str(currentPage) + ': got '
          + str(len(itemList)) + ' records.')
    # Map the document-type tag in the search URL to the _type parameter
    # of the detail URL:
    #   HY -> conference, e.g. http://www.wanfangdata.com.cn/details/detail.do?_type=conference&id=7730508
    #   XW -> degree,     e.g. http://www.wanfangdata.com.cn/details/detail.do?_type=degree&id=D01551993
    #   QK -> perio,      e.g. http://www.wanfangdata.com.cn/details/detail.do?_type=perio&id=bjgydxxb-shkx201902004
    searchType = {'HY': 'conference', 'XW': 'degree', 'QK': 'perio'}.get(tag)
    if searchType:
        for item in itemList:
            itemTitle = ''.join(
                item.xpath('.//a[@class="title"]//text()')).replace(' ', '')
            itemUrl = item.xpath('.//a[@class="title"]/@href')[0]
            itemId = itemUrl.split('/')[-1]
            newItemUrl = ('http://www.wanfangdata.com.cn/details/detail.do'
                          '?_type=' + searchType + '&id=' + itemId)
            info = {
                'searchKeyWord': keyword,
                'searchType': searchType,
                'title': itemTitle
            }
            app.send_task('tasks.crawl_detail_with_url',
                          args=(newItemUrl, ), kwargs=info)
    # Collect the pagination links so the following pages can be scheduled too
    nextUrls = etree_html.xpath('//p[@class="pager"]//a[@class="page"]/@href')
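nextUrls is extracted but never used in this snippet. A minimal sketch of a continuation that could be appended at the end of parse_page_data, assuming a tasks.crawl task that accepts a full URL and that the pager hrefs may be relative:

from urllib.parse import urljoin

    # Hypothetical continuation inside parse_page_data: schedule every
    # pagination link as a new crawl task; urljoin resolves relative
    # hrefs against the current page's URL
    for nextUrl in nextUrls:
        app.send_task('tasks.crawl', args=(urljoin(response.url, nextUrl), ))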
from local_test.task1 import do_test
import random

from workers import app

for i in range(1, 1000):
    x = random.randint(0, 100)
    y = random.randint(0, 100)
    # do_test.delay()
    # Note: x and y are generated but not passed along; args=() sends
    # the task with no arguments
    app.send_task('local_test.task1.do_test',
                  queue='task1_queue',
                  routing_key='task1_routing',
                  args=())
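The local_test.task1 module is not shown here. A minimal sketch of what do_test might look like, assuming it registers against the shared Celery app from workers:

# local_test/task1.py -- hypothetical sketch of the task dispatched above
from workers import app

@app.task(name='local_test.task1.do_test')
def do_test():
    # Trivial body; the real task presumably does actual work
    print('do_test executed')
    return 'ok'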
def manage_crawl_task(urls):
    for url in urls:
        app.send_task('tasks.crawl_pageurl_and_detailurl', args=(url, ))
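All of the snippets above dispatch through a shared app object. A minimal sketch of what the workers module might contain, assuming a Redis broker; the queue and routing names are taken from the producer loop above and the routing table is purely illustrative:

# workers.py -- hypothetical sketch of the shared Celery application
from celery import Celery

app = Celery('workers', broker='redis://127.0.0.1:6379/1')

# Route the test task to the dedicated queue used by the producer loop
app.conf.task_routes = {
    'local_test.task1.do_test': {
        'queue': 'task1_queue',
        'routing_key': 'task1_routing',
    },
}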