示例#1
0
def crawl_caipu_detail(url, categoryId):
    """Fetch a recipe detail page and extract its fields.

    :param url: absolute URL of the recipe detail page
    :param categoryId: id of the category this recipe belongs to
    :return: dict with keys 'title' and 'categoryId'
    """
    html, url = send_request(url=url)
    doc = etree.HTML(html)
    # Page title; fall back to '' when the heading is missing.
    title = extract_first(
        doc.xpath('//h1[@class="page-title"]/text()'), '')
    return {'title': title, 'categoryId': categoryId}
示例#2
0
def crawl_caipu_list(url, categoryId):
    """
    Request the recipe-list page for a category, dispatch a detail crawl
    for every recipe link found, then follow pagination by re-queueing
    this task for the next page.
    :param url: list-page URL for the category
    :param categoryId: id of the category being crawled
    :return: None
    """
    html, url = send_request(url=url)
    doc = etree.HTML(html)
    # Anchor elements that point at individual recipe detail pages.
    anchors = doc.xpath('//ul[@class="list"]/li//p[@class="name"]/a')
    print('===================', len(anchors))
    for anchor in anchors:
        detail_url = urljoin(baseUrl, extract_first(anchor.xpath('./@href')))
        print('菜谱url地址', detail_url)
        # Hand each detail page off to an async worker.
        crawl_caipu_detail.delay(detail_url, categoryId)
    # Follow the "next page" link, if present.
    next_href = extract_first(doc.xpath('//a[@class="next"]/@href'))
    if next_href:
        crawl_caipu_list.delay(urljoin(baseUrl, next_href), categoryId)
示例#3
0
def download1(url):
    """Download *url* and return a size-tagged completion message."""
    html, url = send_request(url)
    # Message is the payload length followed by a fixed marker string.
    return '{}下载完毕1'.format(len(html))
示例#4
0
def crawl_category_list(url):
    """Fetch the category listing page and queue its HTML for parsing."""
    html, url = send_request(url=url)
    # Parsing runs asynchronously in a worker task.
    parse_category_data.delay(html)
示例#5
0
def download2(url):
    """Wait 5 seconds, download *url*, and return a size-tagged message."""
    time.sleep(5)
    html, url = send_request(url)
    # Message is the payload length followed by a fixed marker string.
    return '{}下载完毕2'.format(len(html))