from random import randint
from time import sleep

from pyquery import PyQuery as pq


def parse_second_page_source_one(html, categories_name):
    '''
    --> Parse the html of a second-level link (e.g. Clothing, Shoes ... under Women)
    and build the urls needed for the third level.
    Parser for the ["Women", "Men", "Girls", "Boys"] case.
    :param html:
    :return: urls needed for the third level
    '''
    if html is None:
        return 'Failed to fetch the page'
    second_doc = pq(html)
    ul_items = second_doc("#leftNav ul").items()
    second_url_list = []
    category_dict = {}
    for ul in ul_items:
        list_a = []
        for all_a in ul.find('li span a').items():
            a_dict = {}
            a_name = all_a.find('span').text()
            a_url = 'https://www.amazon.com' + all_a.attr('href')
            a_dict[a_name] = a_url
            list_a.append(a_dict)
        second_url_list.append(list_a)
    if second_url_list:
        # Convert into the format we need, which is easier to store and organize
        second_url_list = second_url_list[-2:]
        category_dict[categories_name] = second_url_list[0]
        category_dict['shops'] = second_url_list[1]
        save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    else:
        print('Robot check triggered')
    t = randint(1, 3)
    sleep(t)
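
# save_link_to_file is defined elsewhere in this project and only called here;
# the version below is a minimal sketch under the assumption that it appends
# one JSON record per line to the named text file. It is not the author's
# implementation -- swap in the real helper if it behaves differently.
import json


def save_link_to_file(link_name, link_url):
    # Append the record as a single JSON line so later stages can re-read it.
    with open(link_name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(link_url, ensure_ascii=False) + '\n')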
def parse_second_page_source_three(html, categories_name):
    '''
    --> Parser for the ["Uniforms, Work & Safety", "Costumes & Accessories",
    "Shoe, Jewelry & Watch Accessories"] case.
    '''
    page_three_doc = pq(html)
    if categories_name in ["Traditional & Cultural Wear"]:
        li_list_item = page_three_doc(
            '#leftNav > ul:nth-child(3) > ul > li > span > ul > div > li')
    else:
        li_list_item = page_three_doc(
            '#leftNav > ul:nth-child(6) > ul > li > span > ul > div > li')
    category_dict = {}
    second_url_list = []
    for li in li_list_item.items():
        a_dict = {}
        a_name = li.find('span > a > span').text()
        a_url = 'https://www.amazon.com' + li.find('span a').attr('href')
        a_dict[a_name] = a_url
        second_url_list.append(a_dict)
    category_dict[categories_name] = second_url_list
    save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    # print(category_dict)
    t = randint(1, 3)
    sleep(t)
def parse_second_page_source_two(html, categories_name):
    '''
    Parser for the ["Baby", "Novelty & More", "Luggage & Travel Gear"] case.
    :param html:
    :param categories_name:
    :return:
    '''
    page_two_doc = pq(html)
    li_list_item = page_two_doc(
        '#leftNav > ul > ul > li > span > ul > div > li')
    category_dict = {}
    second_url_list = []
    for li in li_list_item.items():
        a_dict = {}
        a_name = li.find('span > a > span').text()
        a_url = 'https://www.amazon.com' + li.find('span a').attr('href')
        a_dict[a_name] = a_url
        second_url_list.append(a_dict)
    category_dict[categories_name] = second_url_list
    save_link_to_file(link_name='second_floor.txt', link_url=category_dict)
    # print(category_dict)
    t = randint(1, 3)
    sleep(t)
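
# The three parse_second_page_source_* variants above each cover a different
# group of top-level categories (see their docstrings). The dispatcher below is
# a sketch, not part of the original module: it only routes a category name to
# the matching variant so a crawl driver can stay generic.
def parse_second_page_source(html, categories_name):
    if categories_name in ["Women", "Men", "Girls", "Boys"]:
        # Two trailing ul blocks (categories + shops) -> variant one.
        return parse_second_page_source_one(html, categories_name)
    if categories_name in ["Baby", "Novelty & More", "Luggage & Travel Gear"]:
        # Single flat li layout -> variant two.
        return parse_second_page_source_two(html, categories_name)
    # Uniforms, Costumes, Shoe/Jewelry/Watch Accessories and the rest -> variant three.
    return parse_second_page_source_three(html, categories_name)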
def parse_dresses_level_child_page(html=None, parent_name=None):
    '''
    Parse the Dresses page, or pages with a similar html structure.
    :param parent_name: Women:Clothing:Dresses
    :return: [{'child_name': 'Women:Clothing:Dresses:Casual', ...}]
    '''
    clothing_doc = pq(html)
    ul_item = clothing_doc(
        '#leftNav > ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-base')
    span_item = ul_item.find('ul > li > span')
    span_list_item = span_item.find('div > li > span')
    li_item = span_list_item.find(
        'ul.a-unordered-list.a-nostyle.a-vertical.s-ref-indent-one > div > li')
    # print(li_item)
    li_list_item = li_item.find(
        'ul.a-unordered-list.a-nostyle.a-vertical.s-ref-indent-one > div > li')
    print(li_list_item)
    print('-' * 60)
    li_item_list = []
    for li_item in li_list_item.items():
        dresses_dict = {}
        child_name = li_item.find('span > a > span').text()
        child_href = 'https://www.amazon.com' + li_item.find('span > a').attr(
            'href')
        if child_name in [
                'Lingerie', 'Sleep & Lounge', 'Thermal Underwear', 'Bikinis',
                'Tankinis', 'One-Pieces', 'Cover-Ups', 'Board Shorts', 'Racing',
                'Rash Guards', 'Down & Parkas', 'Wool & Pea Coats',
                'Trench, Rain & Anoraks', 'Quilted Lightweight Jackets',
                'Casual Jackets', 'Denim Jackets', 'Leather & Faux Leather',
                'Fur & Faux Fur', 'Vests', 'Active & Performance'
        ]:
            dresses_dict['category_levels'] = "{}:{}".format(
                parent_name, child_name)
            dresses_dict['category_url'] = child_href
            save_link_to_file('clothing_fifth_floor.txt', dresses_dict)
            li_item_list.append(dresses_dict)
        else:
            dresses_dict['category_class'] = 1
            dresses_dict['category_levels'] = "{}:{}".format(
                parent_name, child_name)
            dresses_dict['category_url'] = child_href
            store_category_list(dresses_dict)
            li_item_list.append(dresses_dict)
    print('-' * 60)
    print(li_item_list)
    return li_item_list
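
# Usage sketch (assumption, not in the original module): the caller is expected
# to pass the html of a third-level listing page together with the
# colon-separated parent path. requests and the User-Agent header are this
# sketch's choices, and category_url is whatever link was saved for Dresses.
# Note that store_category_list, called in the else branch above, is also
# defined elsewhere in the project.
def parse_one_dresses_page(category_url, parent_name='Women:Clothing:Dresses'):
    import requests
    html = requests.get(category_url,
                        headers={'User-Agent': 'Mozilla/5.0'}).text
    return parse_dresses_level_child_page(html=html, parent_name=parent_name)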
def parse_first_page_source(html):
    '''
    --> Parse the html of the first-level clothing link (Women, Men, Girls, Boys ...)
    and build the urls needed for the second level.
    :param html:
    :return: urls needed for the second level
    '''
    first_doc = pq(html)
    li_items = first_doc("#leftNav").find("li").items()
    for li in li_items:
        categories_dict = {}
        Categories_name = li.find('a h4').text()
        href = 'https://www.amazon.com' + li.find('a').attr('href')
        categories_dict['categories_name'] = Categories_name
        categories_dict['href'] = href
        save_link_to_file(link_name='first_floor.txt', link_url=categories_dict)
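
# End-to-end sketch (assumption, not the author's crawler): it only shows how
# parse_first_page_source chains into the second-level parsers. requests, the
# headers argument and start_url are placeholders, and reading first_floor.txt
# as JSON lines relies on the save_link_to_file sketch above.
def crawl_clothing_departments(start_url, headers=None):
    import json
    import requests
    # First level: write one record per department into first_floor.txt.
    parse_first_page_source(requests.get(start_url, headers=headers).text)
    # Second level: route every saved department to the matching parser.
    with open('first_floor.txt', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            html = requests.get(record['href'], headers=headers).text
            parse_second_page_source(html, record['categories_name'])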