# Assumed imports for these snippets; save_link_to_file and
# store_category_list are project helpers defined elsewhere in the module.
from random import randint
from time import sleep

from pyquery import PyQuery as pq


def parse_second_page_source_one(html, categories_name):
    '''
    --> Parse the second-level category page (e.g. Clothing, Shoes, ... under
        Women) and build the URLs needed for the third level.
    Handles the case where the top-level category is one of
    ["Women", "Men", "Girls", "Boys"].
    :param html: page source of the second-level category page
    :return: an error string if html is None; otherwise None (results are
             written to second_floor.txt via save_link_to_file)
    '''
    if html is None:
        return 'Failed to fetch the page'
    second_doc = pq(html)
    ul_items = second_doc("#leftNav ul").items()

    second_url_list = []
    category_dict = {}
    for ul in ul_items:
        list_a = []
        for all_a in ul.find('li span a').items():
            a_dict = {}
            a_name = all_a.find('span').text()
            a_url = 'https://www.amazon.com' + all_a.attr('href')
            a_dict[a_name] = a_url

            list_a.append(a_dict)
        second_url_list.append(list_a)

    if second_url_list:  # convert to the format we need, for easier storage and sorting
        second_url_list = second_url_list[-2:]
        category_dict[categories_name] = second_url_list[0]
        category_dict['shops'] = second_url_list[1]
        save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    else:
        print('Robot check triggered')

    # random pause between requests to reduce the chance of a robot check
    t = randint(1, 3)
    sleep(t)
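
# A minimal sketch of what the save_link_to_file helper used above might look
# like. The project's real implementation is not shown in these snippets, so
# the JSON-lines format below is an assumption made purely for illustration.
import json


def save_link_to_file(link_name, link_url):
    # append one category dict per line so the crawl output stays readable
    # and can be re-loaded with json.loads line by line
    with open(link_name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(link_url, ensure_ascii=False) + '\n')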


def parse_second_page_source_three(html, categories_name):
    '''
    --> Handles the case where the top-level category is one of
        ["Uniforms, Work & Safety", "Costumes & Accessories",
         "Shoe, Jewelry & Watch Accessories"].
    '''
    page_three_doc = pq(html)
    # the target ul is a different child of #leftNav for this category
    if categories_name in ["Traditional & Cultural Wear"]:
        li_list_item = page_three_doc(
            '#leftNav > ul:nth-child(3) > ul > li > span > ul > div > li')
    else:
        li_list_item = page_three_doc(
            '#leftNav > ul:nth-child(6) > ul > li > span > ul > div > li')

    category_dict = {}
    second_url_list = []
    for li in li_list_item.items():
        a_dict = {}
        a_name = li.find('span > a > span').text()
        a_url = 'https://www.amazon.com' + li.find('span a').attr('href')
        a_dict[a_name] = a_url
        second_url_list.append(a_dict)

    category_dict[categories_name] = second_url_list
    save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    # print(category_dict)

    t = randint(1, 3)
    sleep(t)


def parse_second_page_source_two(html, categories_name):
    '''
    Handles the case where the top-level category is one of
    ["Baby", "Novelty & More", "Luggage & Travel Gear"].
    :param html: page source of the second-level category page
    :param categories_name: name of the top-level category
    :return: None; results are written to second_floor.txt via save_link_to_file
    '''
    page_two_doc = pq(html)
    li_list_item = page_two_doc(
        '#leftNav > ul > ul > li > span > ul > div > li')

    category_dict = {}
    second_url_list = []
    for li in li_list_item.items():
        a_dict = {}
        a_name = li.find('span > a > span').text()
        a_url = 'https://www.amazon.com' + li.find('span a').attr('href')
        a_dict[a_name] = a_url
        second_url_list.append(a_dict)

    category_dict[categories_name] = second_url_list
    save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    # print(category_dict)
    t = randint(1, 3)
    sleep(t)
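
# A hedged usage sketch: dispatch a fetched second-level page to whichever
# parser above matches its top-level category, following the groupings named
# in the docstrings. parse_second_page is a hypothetical wrapper, not part of
# the original module.
def parse_second_page(html, categories_name):
    if categories_name in ["Women", "Men", "Girls", "Boys"]:
        parse_second_page_source_one(html, categories_name)
    elif categories_name in ["Baby", "Novelty & More", "Luggage & Travel Gear"]:
        parse_second_page_source_two(html, categories_name)
    else:
        # e.g. "Uniforms, Work & Safety", "Costumes & Accessories",
        # "Shoe, Jewelry & Watch Accessories", "Traditional & Cultural Wear"
        parse_second_page_source_three(html, categories_name)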


def parse_dresses_level_child_page(html=None, parent_name=None):
    '''
    Parse the Dresses page (and pages with a similar structure).
    :param html: page source of the category page
    :param parent_name: e.g. 'Women:Clothing:Dresses'
    :return: [{'category_levels': 'Women:Clothing:Dresses:Casual',
               'category_url': ...}, ...]
    '''
    clothing_doc = pq(html)
    ul_item = clothing_doc(
        '#leftNav > ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-base')
    span_item = ul_item.find('ul > li > span')
    span_list_item = span_item.find('div > li > span')
    li_item = span_list_item.find(
        'ul.a-unordered-list.a-nostyle.a-vertical.s-ref-indent-one > div > li')
    # print(li_item)
    li_list_item = li_item.find(
        'ul.a-unordered-list.a-nostyle.a-vertical.s-ref-indent-one > div > li')
    print(li_list_item)
    print('-' * 60)

    li_item_list = []
    for li in li_list_item.items():
        dresses_dict = {}
        child_name = li.find('span > a > span').text()
        child_href = 'https://www.amazon.com' + li.find('span > a').attr(
            'href')
        if child_name in [
                'Lingerie', 'Sleep & Lounge', 'Thermal Underwear', 'Bikinis',
                'Tankinis', 'One-Pieces', 'Cover-Ups', 'Board Shorts',
                'Racing', 'Rash Guards', 'Down & Parkas', 'Wool & Pea Coats',
                'Trench, Rain & Anoraks', 'Quilted Lightweight Jackets',
                'Casual Jackets', 'Denim Jackets', 'Leather & Faux Leather',
                'Fur & Faux Fur', 'Vests', 'Active & Performance'
        ]:

            dresses_dict['category_levels'] = "{}:{}".format(
                parent_name, child_name)
            dresses_dict['category_url'] = child_href
            save_link_to_file('clothing_fifth_floor.txt', dresses_dict)
            li_item_list.append(dresses_dict)
        else:
            dresses_dict['category_class'] = 1
            dresses_dict['category_levels'] = "{}:{}".format(
                parent_name, child_name)
            dresses_dict['category_url'] = child_href
            store_category_list(dresses_dict)
            li_item_list.append(dresses_dict)

    print('-' * 60)
    print(li_item_list)
    return li_item_list
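
# A hedged usage sketch for the dresses-level parser. requests is assumed here
# purely for illustration (the original project may fetch pages differently),
# and crawl_dresses_children / page_url are hypothetical names.
def crawl_dresses_children(page_url, parent_name='Women:Clothing:Dresses'):
    import requests
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(page_url, headers=headers, timeout=30)
    return parse_dresses_level_child_page(html=resp.text,
                                          parent_name=parent_name)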


def parse_first_page_source(html):
    '''
    --> Parse the first-level clothing page (Women, Men, Girls, Boys, ...) and
        build the URLs needed for the second level.
    :param html: page source of the first-level clothing page
    :return: None; results are written to first_floor.txt via save_link_to_file
    '''
    first_doc = pq(html)
    li_items = first_doc("#leftNav").find("li").items()
    for li in li_items:
        categories_dict = {}
        categories_name = li.find('a h4').text()
        href = 'https://www.amazon.com' + li.find('a').attr('href')
        categories_dict['categories_name'] = categories_name
        categories_dict['href'] = href
        save_link_to_file(link_name='first_floor.txt',
                          link_url=categories_dict)
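

# A hedged end-to-end sketch tying the levels together: fetch the clothing
# landing page, build the first-floor links, then fetch each saved href and
# hand it to the second-level parsers above. crawl_clothing_tree and entry_url
# are hypothetical names; requests is assumed only for illustration.
def crawl_clothing_tree(entry_url):
    import requests
    resp = requests.get(entry_url,
                        headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    if resp.status_code == 200:
        parse_first_page_source(resp.text)
    # first_floor.txt now holds {'categories_name': ..., 'href': ...} entries;
    # each href can be fetched the same way and passed to the second-level
    # parsers (parse_second_page_source_one/two/three) above.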