def store_last_level_of_hierarchy(hierarchy, page_soup, url):
    """
    Record the last (leaf) level of a category hierarchy.

    :param hierarchy: pipe-delimited hierarchy name built so far
    :param page_soup: parsed page exposing ``find(name, attrs)``; when it is
        falsy the function does nothing
    :param url: url of the current page, used as the category url when the
        page has no layout-picker link
    :return: None
    :working: reads the category name from the <h4> heading, takes the
        layout-picker link (tiles view url) or falls back to ``url``, then
        prints the ``hierarchy|url`` line and passes it to
        create_directory_and_hierarchy_files
    """
    if not page_soup:
        return

    h4_tag = page_soup.find('h4', {'class': H4_TAG_CLASS})
    if not h4_tag:
        # Without the heading there is no category name to record.
        return
    category_name = string_format(h4_tag)

    # To get tiles view url; fall back to the current page url when the
    # layout picker link is absent.
    category_url_tag = page_soup.find('a', {'title': LAYOUT_PICKER})
    if category_url_tag:
        category_url = url_format(category_url_tag['href'])
    else:
        category_url = url

    # Get hierarchy name: do not append the category when it is already
    # part of the hierarchy string.
    # NOTE(review): this is a substring test on the whole pipe-delimited
    # string, so a name contained inside a longer segment also matches --
    # confirm that is intended.
    if category_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

    line = '{}|{}'.format(hierarchy_name, category_url)
    # Call form of print: valid on Python 3 and, for a single argument,
    # prints the same text on Python 2.
    print(line)

    # store the line in a file and create hierarchy directory
    create_directory_and_hierarchy_files(hierarchy_name, line)
def find_hierarchy(hierarchy, url):
    """
    :param hierarchy: category_hierarchy name (pipe-delimited)
    :param url: current_page_url
    :return: None
    :working: recursive function that walks the sub-category links of the
        page at ``url``; for every link it calls itself with the extended
        hierarchy name, and when the sub-category list has no links it
        treats the page as the last level and stores it through
        store_last_level_of_hierarchy
    """
    response = get_content(url)
    if not response:
        return

    sub_categories_container = response.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    # No anchor tags means this is the last level of the hierarchy: the
    # shared helper builds the "hierarchy|url" line, prints it and passes it
    # to create_directory_and_hierarchy_files.
    if not anchor_tags:
        store_last_level_of_hierarchy(hierarchy, response, url)
        return

    # Otherwise the page contains more categories: recurse into each one.
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)
        find_hierarchy(hierarchy_name, category_url)
def find_hierarchy(hierarchy_name, url):
    """
    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: current page url
    :return: None
    :working: recursive function; when the sub-category container holds an
        <h4> tag the page is the last level and the ``hierarchy|url`` line
        is printed and passed to create_directory_and_hierarchy_files,
        otherwise every sub-category link is followed and its url is
        appended to ``sub_category_urls``
    """
    response = get_content(url)
    if not response:
        return

    sub_category_container = find_sub_category_container(response)
    if not sub_category_container:
        return

    h4_tag = sub_category_container.find('h4', {'class': H4_TAG_CLASS})

    # If it contains <h4> tag then it is last level of hierarchy
    if h4_tag:
        anchor_tag = h4_tag.find('a')
        # find() returns None when the heading has no link; fall back to the
        # current page url instead of failing on anchor_tag['href'].
        if anchor_tag is not None and anchor_tag.get('href'):
            category_url = url_format(anchor_tag['href'])
        else:
            category_url = url

        line = '{}|{}'.format(hierarchy_name, category_url)
        print(line)

        # store the line in a file and create hierarchy directory
        create_directory_and_hierarchy_files(hierarchy_name, line)
        return

    # else it contains more categories then find urls and again call this function
    anchor_tags = sub_category_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy = '{}|{}'.format(hierarchy_name, category_name)

        # recursively calling this function
        find_hierarchy(hierarchy, category_url)

        # sub_category_urls is defined outside this function; the formatted
        # url is added after the recursive call, as before.
        sub_category_urls.append(category_url)