Example #1
def page_category(url):
    print('*****start page_category() begin crawling a single category page')
    recipes = []

    base_url = url
    urls = []  # per-page URLs; build three pages' worth here
    for x in range(1, 4):
        urls.append(base_url + 'page/' + str(x) + '/')
    # iterate over the URLs and issue the requests
    for url in urls:
        html = meishi_requests.get(url)
        for item in html.xpath('//div[@id="J_list"]//li'):
            temp = {}
            temp['show_img'] = verify_text(
                item.xpath('./div[@class="pic"]//img/@data-src'))
            temp['show_title'] = verify_text(
                item.xpath('./div[@class="detail"]//a/text()'))
            temp['show_username'] = verify_text(
                item.xpath(
                    './div[@class="detail"]//p[@class="subline"]/a/text()'))
            temp['material'] = verify_text(
                item.xpath('.//p[@class="subcontent"]/text()'))
            temp['detail'] = detail.parse_detail_recipe(
                item.xpath('./div[@class="detail"]//a/@href')[0])
            recipes.append(temp)
    print('*****done page_category() finished crawling the category page')
    return recipes
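
These examples lean on a verify_text() helper that is defined elsewhere in the module. A minimal sketch of its assumed behavior, guarding against empty xpath results:

def verify_text(result):
    # Assumed behavior: return the first xpath match, stripped of whitespace,
    # or '' when the selector matched nothing.
    return result[0].strip() if result else ''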
Example #2
def get_index_categories():
    # Fetch the category data on the homepage (only the first two)
    print('*****start get_index_categories() begin extracting homepage category data')

    html = meishi_requests.get('https://www.meishichina.com/')
    # grab the category names into a list (no validation needed; the result is highly predictable)
    categories = html.xpath('//div[@class="w5"]//h3[position() <3]/a/text()')
    # fetch the detailed recipe data under each list
    all_recipes = []
    for item in html.xpath(
            '//div[@class="w5"]//div[@class="big4_list clear mt10"]/ul[position() < 3]'
    ):
        recipes = []
        for i in item.xpath('./li'):
            temp = {}
            temp['show_title'] = verify_text(i.xpath('.//p/text()'))
            temp['show_username'] = verify_text(
                i.xpath('.//a[@class="u"]/text()'))
            temp['show_img'] = verify_text(i.xpath('.//img/@data-src'))
            temp['detail'] = detail.parse_detail_recipe(
                verify_text(i.xpath('.//a[1]/@href')))
            recipes.append(temp)
        all_recipes.append(recipes)
    # assemble the data
    for index, item in enumerate(all_recipes):
        data = {}
        data['categories'] = categories[index] if index < len(categories) else ''
        data['recipes'] = item
        # write out the data
        write_data('categories/index-categories', str(index + 1), data)
    print('*****done get_index_categories() finished extracting homepage category data')
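
write_data() is another helper not shown on this page. Judging by its (folder, filename, data) arguments and the JSON file writing in Examples #6 and #7, a plausible sketch; the path layout is an assumption:

import json
import os

def write_data(folder, filename, data):
    # Hypothetical sketch: persist one record as UTF-8 JSON
    # under ./<folder>/<filename>.txt.
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename + '.txt'), 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(data, ensure_ascii=False))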
Example #3
def get_all_categories():
    # Fetch the recipes under every category: effectively the first thirty categories on the page, crawling 3 pages of each
    print('*****start get_all_categories() begin crawling all category pages')
    html = meishi_requests.get('https://home.meishichina.com/recipe-type.html')
    # no validation needed below; the data is highly predictable
    for index, item in enumerate(
            html.xpath('//div[@class="category_sub clear"][1]//li')):
        data = {}
        data['categories'] = item.xpath('./a/@title')[0]
        data['recipes'] = page_category(item.xpath('./a/@href')[0])
        write_data('categories/all-categories', str(index + 1), data)
    print('*****done get_all_categories() finished crawling all category pages')
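
Every example fetches pages through meishi_requests.get() and immediately calls .xpath() on the result, so the wrapper must return an lxml element tree. A minimal sketch of such a module; the headers and timeout are assumptions:

import requests
from lxml import etree

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed; the real module may send more

def get(url):
    # Fetch the page and return an lxml tree ready for .xpath() queries.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)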
Example #4
def get_detail_categories():
    # Crawl the data for the detailed categories
    print('*****start get_detail_categories() begin extracting detailed category data')
    # request the recipe category page
    html = meishi_requests.get('https://home.meishichina.com/recipe.html')
    # extract each category's name (no validation; the data is highly predictable)
    for index, h3 in enumerate(html.xpath('//div[@class="ui_title"]//h3')):
        data = {}
        data['categories'] = h3.xpath('./a/text()')[0]
        data['recipes'] = get_ajax_data(
            h3.xpath('./a/@data')[0],
            h3.xpath('./a/@order')[0])
        # write out the data
        write_data('categories/detail-categories', str(index + 1), data)
    print('*****done get_detail_categories() finished extracting detailed category data')
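
get_ajax_data() is not shown either. The @data and @order attributes scraped from each heading link are presumably forwarded as query parameters to an AJAX endpoint; the endpoint URL and parameter names below are purely illustrative assumptions:

import requests

AJAX_URL = 'https://home.meishichina.com/ajax/recipe-list'  # hypothetical endpoint

def get_ajax_data(data_id, order):
    # Hypothetical sketch: request the category's recipe list via AJAX.
    response = requests.get(AJAX_URL, params={'data': data_id, 'order': order}, timeout=10)
    return response.json()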
Example #5
def get_slider_URL():
    # Fetch the URLs from the homepage slider
    print('*****start get_slider_URL() begin extracting slider URLs')
    data = {}
    html = meishi_requests.get('https://www.meishichina.com/')
    slider_pages_url = html.xpath(
        '//div[@id="home_index_slider"]/ul/li/a[@title != "2020,人人都是美食家"]/@href'
    )
    imgs_url = html.xpath(
        '//div[@id="home_index_slider"]/ul/li/a[@title != "2020,人人都是美食家"]/img/@src'
    )
    data['slider_pages_url'] = slider_pages_url
    data['imgs_url'] = imgs_url
    print('*****done get_slider_URL() finished extracting slider URLs')
    return data
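
A usage sketch connecting this with parse_slider_pages() from Example #7: the two lists are parallel, so each (page URL, image URL) pair can be zipped together:

slider = get_slider_URL()
for page_url, img_url in zip(slider['slider_pages_url'], slider['imgs_url']):
    parse_slider_pages(page_url, img_url)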
Example #6
def parse_menu_pages(temporary_data):
    # Process the data on each menu page
    print('*****start parse_menu_pages(temporary_data) begin processing menu page data')
    data = {}
    # pull out the data already shown on the homepage
    data['home_title'] = temporary_data['home_title']
    data['home_content'] = temporary_data['home_content']
    data['username'] = temporary_data['username']
    # issue the request
    html = meishi_requests.get(temporary_data['url'])
    # parse the data
    data['page_data'] = {}
    # title
    data['page_data']['title'] = html.xpath(
        '//a[@id="collect_title"]/text()')[0].strip()
    # creation time
    data['page_data']['creation_time'] = init_time(
        html.xpath('//div[@class="collect_dp"]/span/text()')[0])
    # username
    data['page_data']['username'] = temporary_data['username']
    # recipe info on the page
    data['page_data']['recipes'] = []
    for item in html.xpath('//div[@id="J_list"]//li'):
        temp = {}
        temp['show_img'] = verify_text(
            item.xpath('./div[@class="pic"]//img/@data-src'))
        temp['show_title'] = verify_text(item.xpath(
            './div[@class="detail"]//a/text()'))
        temp['show_username'] = verify_text(item.xpath(
            './div[@class="detail"]//p[@class="subline"]/a/text()'))
        temp['material'] = verify_text(
            item.xpath('.//p[@class="subcontent"]/text()'))
        temp['detail'] = detail.parse_detail_recipe(
            item.xpath('./div[@class="detail"]//a/@href')[0])
        data['page_data']['recipes'].append(temp)
    # write the data to a file
    global INDEX
    with open('./menu_data/'+str(INDEX)+'.txt', 'w', encoding='utf-8') as fp:
        print('start writing file')
        fp.write(json.dumps(data, ensure_ascii=False))
        print('finished writing file')
    INDEX += 1
    print('*****done parse_menu_pages(temporary_data) finished processing menu page data')
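
init_time() (defined elsewhere) normalizes the raw creation-time label. Assuming the span text embeds a plain date, a regex-based sketch could be:

import re

def init_time(raw):
    # Assumed behavior: pull a YYYY-MM-DD date out of a label such as
    # 'created 2020-05-01'; fall back to the stripped raw text.
    match = re.search(r'\d{4}-\d{2}-\d{2}', raw)
    return match.group(0) if match else raw.strip()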
Example #7
def parse_slider_pages(slider_page_url, slpage_img_url):
    # Process the page behind each slider entry
    print('*****start parse_slider_pages(slider_page_url) begin extracting slider page data')
    page_data = {}  # one {} under slider_pages_data
    page_data['img_url'] = slpage_img_url
    html = meishi_requests.get(slider_page_url)

    # grab the page description
    page_data['desc'] = verify_text(html.xpath('//p[@id="mof_desc"]/text()'))
    # grab the sub-headings and their content
    page_data['list'] = []  # the list[] inside each slider_pages_data entry
    # headings
    mo_result = verify_list(
        html.xpath('//div[@class="mo" and position() < last()-1]/h2/text()'))
    # content
    p_list = verify_list(html.xpath('//div[@class="msb"]/p/text()'))
    for index, p in enumerate(p_list):
        p_list[index] = p.replace('\n', '').strip()

    # recipes for each section
    recipes_result = []  # recipes[] under each object in that list[]
    msb_ul = verify_list(html.xpath('//div[@class="msb_list clear"]/ul'))
    for item in msb_ul:
        # collect each li set of detailed recipes and hand it to the detail module, which returns a list
        recipes_result.append(
            detail.parse_slider_recipes_pages(item.xpath('./li')))

    # recombine the matching headings, content, and recipe sets
    for index, item in enumerate(recipes_result):
        temp = {}
        temp['title'] = mo_result[index] if index < len(mo_result) else ''
        temp['content'] = p_list[index] if index < len(p_list) else ''
        temp['recipes'] = recipes_result[index]
        page_data['list'].append(temp)

    global INDEX
    with open('./slider_data/' + str(INDEX) + '.txt', 'w',
              encoding='utf-8') as fp:
        print('start writing file')
        fp.write(json.dumps(page_data, ensure_ascii=False))
    INDEX += 1
    print('*****done parse_slider_pages(slider_page_url) finished extracting slider page data')
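
verify_list() mirrors verify_text() for list-valued xpath results. A minimal sketch, assuming it merely normalizes empty matches:

def verify_list(result):
    # Assumed behavior: pass the xpath result through,
    # substituting [] when nothing matched.
    return result if result else []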
Example #8
def get_menus_URL():
    # Fetch each menu URL from the homepage
    print('*****start get_menus_URL() begin extracting homepage menu URLs')
    data_temp = []
    # issue the request
    html = meishi_requests.get('https://www.meishichina.com/')
    # no validation below: an error here means the scraped data is fatally wrong, and the program must not silently correct it
    for item in html.xpath('//div[@id="w2_slider"]//li'):
        temp = {}
        temp['url'] = item.xpath('.//a/@href')[0]
        temp['home_title'] = item.xpath('.//a/text()')[0].strip()
        temp['home_content'] = init_text(item.xpath('./p/text()'))
        temp['username'] = item.xpath('./p/span/text()')[0].strip()
        data_temp.append(temp)

    print('*****done get_menus_URL() finished extracting homepage menu URLs')
    # return the parsed URLs along with the data shown on the homepage
    return data_temp
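
The keys returned here (url, home_title, home_content, username) are exactly what parse_menu_pages() in Example #6 consumes, so a driver reduces to a simple loop:

if __name__ == '__main__':
    # Crawl every homepage menu entry found above.
    for menu in get_menus_URL():
        parse_menu_pages(menu)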