def main_second():
    """Parse third-level URLs out of every second-level URL and persist them.

    Reads ``first_floor.txt`` (one JSON object per line with keys
    ``categories_name`` and ``href``), fetches each page's source, and
    dispatches to the parser that matches that category group's page layout.

    :return: None
    """
    file_path = os.path.join(path, 'first_floor.txt')
    # Fix: the original opened the file without a context manager and only
    # closed it on the success path — an exception escaping the loop leaked
    # the handle. `with` guarantees closure on every path.
    with open(file_path, encoding='utf-8') as fh:
        for per_column in fh:
            try:
                url_item = json.loads(per_column.strip())
                categories_name = url_item['categories_name']
                url = url_item['href']
                html = get_page_source(url)
                print(html)
                # Three category groups use three different page layouts,
                # hence three separate parsers.
                if categories_name in ("Women", "Men", "Girls", "Boys"):
                    parse_second_page_source_one(html, categories_name)
                elif categories_name in ("Baby", "Novelty & More",
                                         "Luggage & Travel Gear"):
                    parse_second_page_source_two(html, categories_name)
                else:
                    parse_second_page_source_three(html, categories_name)
            except Exception as e:
                # Deliberate best-effort crawl: report the failure for this
                # line and keep processing the rest of the file.
                print(e)
                continue
def get_clothing_child_url(clothing_name=None, clothing_url=None):
    """Collect every child-category URL beneath a clothing-level category.

    Also applicable to the Shoes and Watches category pages.

    :param clothing_name: category path such as ``Women:Clothing``
    :param clothing_url: URL of the category listing page to fetch
    :return: None (results are stored / recursed into per child)
    """
    # Children at this depth that are already leaves: store them directly
    # instead of descending one more level.
    leaf_categories = (
        "Women:Clothing:Fashion Hoodies & Sweatshirts",
        "Women:Clothing:Jeans",
        "Women:Clothing:Leggings",
        "Women:Clothing:Jumpsuits, Rompers & Overalls",
    )
    page = get_page_source(clothing_url)
    print('-' * 60)
    for child in parse_clothing_child_page(page, clothing_name):
        name, href = child['name'], child['href']
        if name in leaf_categories:
            store_category_list({
                "category_class": 0,
                "category_levels": name,
                "category_url": href,
            })
        else:
            print('获取第四层连接:', href)
            print('-' * 60)
            get_dresses_level_child_url_list(href, name)
def test_case():
    """Smoke test: fetch a known second-level page and run the layout-three
    parser on it, to confirm the logic works without raising.

    NOTE(review): ``test_case`` is defined again later in this module, so
    that later definition shadows this one at import time.
    """
    target = 'https://www.amazon.com/s/ref=lp_7141123011_ex_n_8/143-3620478-1851336?rh=n%3A7141123011%2Cn%3A7586144011&bbn=7141123011&ie=UTF8'
    page = get_page_source(target)
    print(page)
    parse_second_page_source_three(page, 'Uniforms, Work & Safety')
def get_dresses_level_child_url_list(dresses_level_url=None, dresses_level_name=None):
    """Fetch and parse every category sitting at the 'Dresses' depth.

    :param dresses_level_url: listing-page URL at this depth
    :param dresses_level_name: category path such as ``Women:Clothing:Dresses``
    :return: None
    """
    source = get_page_source(dresses_level_url)
    print('-' * 60)
    parse_dresses_level_child_page(source, dresses_level_name)
def main_first():
    """Parse the second-level URLs out of the top-level page and persist
    them to first_url.txt.
    """
    root_url = ("https://www.amazon.com/amazon-fashion/b/"
                "ref=topnav_storetab_sl?ie=UTF8&node=7141123011")
    source = get_page_source(root_url)  # fetch the page's HTML source
    print(source)
    parse_first_page_source(source)  # extract the second-level links
def test_case():
    """Exercise parse_second_page_source_three against a live category page.

    NOTE(review): this module defines ``test_case`` twice; being the later
    definition, this one is the copy that survives at import time.
    """
    target = 'https://www.amazon.com/s/ref=lp_7141123011_ex_n_8/143-3620478-1851336?rh=n%3A7141123011%2Cn%3A7586144011&bbn=7141123011&ie=UTF8'
    source = get_page_source(target)
    print(source)
    parse_second_page_source_three(source, 'Uniforms, Work & Safety')
def get_clothing_child_url(clothing_url=None):
    """Fetch a clothing child-category page and run the child-page parser.

    :param clothing_url: page URL to fetch; when omitted, falls back to the
        hard-coded Women:Clothing listing URL.
    :return: None

    NOTE(review): this module defines ``get_clothing_child_url`` twice with
    different signatures; this later definition shadows the earlier,
    fuller one at import time — confirm which is intended.
    """
    default_url = ("https://www.amazon.com/s/ref=lp_7147440011_ex_n_2?"
                   "rh=n%3A7141123011%2Cn%3A7147440011%2Cn%3A1040660&bbn=7147440011&ie=UTF8")
    # Bug fix: the original accepted clothing_url but ignored it, always
    # fetching the hard-coded URL. Use the caller's URL when provided.
    html = get_page_source(clothing_url or default_url)
    parse_clothing_child_page(html)