def crawl_caipu_detail(url, categoryId):
    """Fetch a recipe detail page and extract its fields.

    :param url: absolute URL of the recipe detail page
    :param categoryId: id of the category this recipe belongs to
    :return: dict with the extracted detail fields ('title', 'categoryId')
    """
    html, url = send_request(url=url)
    # Build an lxml tree from the fetched page so we can XPath into it.
    etree_html = etree.HTML(html)
    detail = {
        # Page heading is the recipe title; fall back to '' when absent.
        'title': extract_first(
            etree_html.xpath('//h1[@class="page-title"]/text()'), ''),
        # Carry the originating category id along with the record.
        'categoryId': categoryId,
    }
    return detail
def crawl_caipu_list(url, categoryId):
    """Fetch one page of a category's recipe list and fan out detail tasks.

    For every recipe link on the page a `crawl_caipu_detail` task is
    dispatched; if a "next page" link exists, this task re-dispatches
    itself for that page.

    :param url: URL of the category list page to crawl
    :param categoryId: id of the category being crawled
    :return: None (work is dispatched asynchronously via .delay)
    """
    html, url = send_request(url=url)
    etree_html = etree.HTML(html)
    # Anchor elements pointing at individual recipe detail pages.
    anchors = etree_html.xpath('//ul[@class="list"]/li//p[@class="name"]/a')
    print('===================', len(anchors))
    for anchor in anchors:
        # Hrefs may be relative; resolve against the site base URL.
        caipu_url = urljoin(baseUrl, extract_first(anchor.xpath('./@href')))
        print('菜谱url地址', caipu_url)
        # Queue an async task to scrape this recipe's detail page.
        crawl_caipu_detail.delay(caipu_url, categoryId)
    # Follow pagination: re-queue this task for the next page, if any.
    next_url = extract_first(etree_html.xpath('//a[@class="next"]/@href'))
    if next_url:
        crawl_caipu_list.delay(urljoin(baseUrl, next_url), categoryId)
def download1(url):
    """Download *url* and report the byte/char length of the response.

    :param url: URL to fetch
    :return: string '<length>下载完毕1' describing the downloaded size
    """
    html, url = send_request(url)
    return f'{len(html)}下载完毕1'
def crawl_category_list(url):
    """Fetch the category index page and hand the HTML to a parser task.

    :param url: URL of the category listing page
    :return: None (parsing is dispatched asynchronously via .delay)
    """
    html, url = send_request(url=url)
    # Parsing happens in a separate async task.
    parse_category_data.delay(html)
def download2(url):
    """Download *url* after a fixed delay and report the response length.

    :param url: URL to fetch
    :return: string '<length>下载完毕2' describing the downloaded size
    """
    # Deliberate 5-second pause before fetching (throttling / demo delay).
    time.sleep(5)
    html, url = send_request(url)
    return f'{len(html)}下载完毕2'