def get_product_urls(hierarchy_url):
    """
    Store the product urls of a category's first listing page, then hand the
    remaining pages over to traverse_pages.

    :param hierarchy_url: string in the format 'hierarchy|page_url'; the text
        after the last '|' is the page url, everything before it is the
        pipe-delimited hierarchy name
    :return: None
    :working: does nothing when the page url is already in the completed set;
        otherwise requests the page, stores the product links it finds,
        looks up the last page and calls traverse_pages
    """
    hierarchy_name, _, page_url = hierarchy_url.rpartition('|')

    # <root><sep><project><sep><hierarchy as folders><sep><completed file>
    completed_path = '{root}{sep}{project}{sep}{folders}{sep}{name}'.format(
        root=DataCollectors_Configuration.ROOT_FOLDER,
        sep=DataCollectors_Configuration.PATH_STYLE,
        project=DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME,
        folders=hierarchy_name.replace(
            '|', DataCollectors_Configuration.PATH_STYLE),
        name=COMPLETED_PAGE)
    completed_set = file_to_set(completed_path)

    if in_completed_urls(page_url, completed_set):
        return

    response = get_content(page_url)
    if not response:
        return

    product_link_class = ('a-link-normal s-access-detail-page '
                          's-color-twister-title-link a-text-normal')
    product_url_tags = response.findAll('a', {'class': product_link_class})
    if product_url_tags:
        urls_list = [
            '{}|{}'.format(hierarchy_name, url_format(tag['href']))
            for tag in product_url_tags
        ]
        update_files(hierarchy_name, urls_list, page_url,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    # NOTE(review): the original indentation was lost; the follow-up pages
    # are assumed to be traversed whenever the first page was fetched, even
    # if it listed no products - confirm.
    last_page = find_last_page(response)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the first level of the category hierarchy found on a page.

    :param hierarchy: category hierarchy name (pipe-delimited)
    :param url: current page url
    :return: None
    :working: reads the anchors inside the INDENT_ONE_CLASS list and puts one
        'hierarchy|category|url' line per anchor on urls_queue, then calls
        urls_queue.join()
    """
    page = get_content(url)
    container = page.find('ul', {'class': INDENT_ONE_CLASS}) if page else None

    if container:
        anchors = container.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
        for anchor in anchors:
            category_name = string_format(anchor)
            category_url = url_format(anchor['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            urls_queue.put('{}|{}'.format(hierarchy_name, category_url))

    # NOTE(review): the original indentation was lost; join() is assumed to
    # run unconditionally at the end of the function - confirm.
    urls_queue.join()
def find_nav_hierarchy(hierarchy, url):
    """
    Follow the "see more" link of every section in the left navigation.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    :working: for each section that has a SEE_MORE_CLASS paragraph, calls
        find_traverse_type with 'hierarchy|section name', the see-more url
        and False
    """
    page = get_content(url)
    if not page:
        return

    nav_container = page.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # The navigation markup is cut at every '<h3>' and each piece is parsed
    # on its own (presumably one piece per navigation section).
    for fragment in str(nav_container).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        see_more_tag = section.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        find_traverse_type(hierarchy_name, category_url, False)
def get_indent_two_hierarchy(hierarchy, page_soup, url):
    """
    Queue the sub categories listed on an already fetched page, or store the
    page as a last-level category when it lists none.

    :param hierarchy: hierarchy_name
    :param page_soup: BeautifulSoup response of the current page
    :param url: current page url
    :return: None
    """
    if not page_soup:
        return

    sub_categories_container = page_soup.find(
        'ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    if not anchor_tags:
        # No sub category links: this is the last level of the hierarchy.
        store_last_level_of_hierarchy(hierarchy, page_soup, url)
        return

    # Sub categories exist: queue each of them for later processing.
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
def collect_all_data(hierarchy, url, last_page, completed_set):
    """
    Collect the product urls from the follow-up listing pages of a category.

    :param hierarchy: hierarchy name
    :param url: listing url; '&page=<n>' is appended for every page
    :param last_page: last page number
    :param completed_set: completed urls, pages found in it are skipped
    :return: None
    :working: for every page number in range(2, last_page) that is not yet
        completed, requests the page, builds one 'hierarchy|product_url'
        line per product link and passes that page's lines to update_files
    """
    product_link_class = ('a-link-normal s-access-detail-page '
                          's-color-twister-title-link a-text-normal')

    # NOTE(review): range() excludes last_page itself - confirm whether
    # find_last_page returns an inclusive page number.
    for page_number in range(2, last_page):
        current_page = '{}&page={}'.format(url, page_number)
        if in_completed_urls(current_page, completed_set):
            continue

        response = get_content(current_page)
        if not response:
            continue

        product_url_tags = response.findAll('a', {'class': product_link_class})
        if not product_url_tags:
            continue

        # Build a fresh list for every page. The previous version kept one
        # list for the whole loop, so each update_files call was handed the
        # links of all earlier pages again.
        url_list = [
            '{}|{}'.format(hierarchy, url_format(tag['href']))
            for tag in product_url_tags
        ]
        print('{}|{}'.format(hierarchy, current_page))
        update_files(hierarchy, url_list, current_page,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def get_tree_hierarchy(hierarchy, url):
    """
    Walk the top navigation links and dispatch each selected category page
    to the handler that matches its layout.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: for every 'nav-a' anchor whose formatted name is in
        SELECTED_LIST, fetches the category page and calls
        find_nav_hierarchy when it has a LEFT_NAV_CLASS div, otherwise
        find_carousel_hierarchy when it has a CAROUSAL_CLASS list; finally
        calls urls_queue.join()
    """
    response = get_content(url)
    if response:
        for anchor_tag in response.findAll('a', {'class': 'nav-a'}):
            category_name = string_format(anchor_tag)
            if category_name not in SELECTED_LIST:
                continue

            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)

            # Kept in its own name so the outer page is not overwritten, and
            # checked before use: a failed fetch used to raise
            # AttributeError on the .find() calls below.
            category_response = get_content(category_url)
            if not category_response:
                continue

            if category_response.find('div', {'class': LEFT_NAV_CLASS}):
                find_nav_hierarchy(hierarchy_name, category_response)
            elif category_response.find('ol', {'class': CAROUSAL_CLASS}):
                find_carousel_hierarchy(hierarchy_name, category_response)

    # NOTE(review): the original indentation was lost; join() is assumed to
    # run unconditionally at the end of the function - confirm.
    urls_queue.join()
def find_nav_hierarchy(hierarchy, page_soup):
    """
    Traverse every link of the 'Featured_Stores' section of the left
    navigation.

    :param hierarchy: hierarchy name
    :param page_soup: BeautifulSoup response of the current page
    :return: None
    :working: links whose formatted name is in IGNORE_LIST are skipped; every
        other link is passed to find_traverse_type as 'hierarchy|link name'
        together with its url
    """
    if not page_soup:
        return

    nav_container = page_soup.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # The navigation markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for fragment in str(nav_container).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue

        main_category_name = string_format(heading)
        if 'Featured_Stores' not in main_category_name:
            continue

        for anchor_tag in section.find_all('a'):
            category_name = string_format(anchor_tag)
            if category_name in IGNORE_LIST:
                continue
            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            find_traverse_type(hierarchy_name, category_url)
def find_hierarchy(hierarchy, url):
    """
    Recursively descend through the sub categories of a page until the last
    level of the hierarchy is reached.

    :param hierarchy: category hierarchy name (pipe-delimited)
    :param url: current page url
    :return: None
    :working: when the INDENT_TWO_CLASS list holds category links, calls
        itself for each of them; when it holds none, passes the page to
        store_last_level_of_hierarchy
    """
    page = get_content(url)
    if not page:
        return

    sub_categories_container = page.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    if not anchor_tags:
        # No deeper categories: this page is the last level of the hierarchy.
        store_last_level_of_hierarchy(hierarchy, page, url)
        return

    # Deeper categories exist: recurse into each one.
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        find_hierarchy('{}|{}'.format(hierarchy, category_name), category_url)
def store_last_level_of_hierarchy(hierarchy, page_soup, url):
    """
    Record a last-level category: print its 'hierarchy|url' line and pass it
    to create_directory_and_hierarchy_files.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: BeautifulSoup response of the category page
    :param url: current page url, used when the page has no link titled
        LAYOUT_PICKER
    :return: None
    """
    if not page_soup:
        return

    h4_tag = page_soup.find('h4', {'class': H4_TAG_CLASS})
    if not h4_tag:
        return
    category_name = string_format(h4_tag)

    # Prefer the tiles-view url when the page offers one.
    category_url_tag = page_soup.find('a', {'title': LAYOUT_PICKER})
    if category_url_tag:
        category_url = url_format(category_url_tag['href'])
    else:
        category_url = url

    # NOTE(review): this is a substring test, so a category name that is
    # only part of an existing level also counts as present - confirm that
    # this is intended.
    if category_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

    line = '{}|{}'.format(hierarchy_name, category_url)
    # print() with a single argument gives the same output on Python 2 and
    # Python 3; the former `print line` statement only parses on Python 2.
    print(line)

    # Store the line in a file and create the hierarchy directory.
    create_directory_and_hierarchy_files(hierarchy_name, line)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the first level of the hierarchy found in the left navigation.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: sections whose name contains 'Neuheiten' or 'Shops' are
        skipped. A section with a SEE_MORE_CLASS paragraph is queued once as
        'hierarchy|section|see-more url'; any other section queues one
        'hierarchy|section|link name|link url' line per anchor
    """
    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        # The filter depends only on the section name, so it is evaluated
        # once here instead of in both branches and again for every anchor.
        if 'Neuheiten' in main_category_name or 'Shops' in main_category_name:
            continue

        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if see_more_tag:
            category_url = url_format(see_more_tag.a["href"])
            hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
            urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
        else:
            for anchor_tag in nav_html.findAll('a'):
                sub_category_name = string_format(anchor_tag)
                hierarchy_name = '{}|{}|{}'.format(
                    hierarchy, main_category_name, sub_category_name)
                sub_category_url = url_format(anchor_tag['href'])
                urls_queue.put('{}|{}'.format(hierarchy_name, sub_category_url))
def get_tree_hierarchy(hierarchy, url):
    """
    Fetch every category tile of a box-grid page and dispatch the category
    page to the handler that matches its layout.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: a category page matching none of the known layouts is queued
        itself as 'hierarchy|category|url'; finally calls urls_queue.join()
    """
    response = get_content(url)
    container = None
    if response:
        container = response.find('div', {'class': BOX_GRID_CONTAINER_CLASS})

    tiles = []
    if container:
        tiles = container.findAll('div', {'class': SMALL_BOX_GRID_CLASS})

    for tile in tiles:
        category_name = string_format(tile.img['alt'])
        category_url = url_format(tile.find('a')['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

        category_response = get_content(category_url)
        if not category_response:
            continue

        # Look up every known layout marker first, then call the handler of
        # the first one that is present.
        indent_one = category_response.find("ul", {'class': INDENT_ONE_CLASS})
        box_grid = category_response.findAll(
            'div', {'class': SMALL_BOX_GRID_CLASS})
        left_nav = category_response.find('div', {'class': LEFT_NAV_CLASS})
        carousel = category_response.find('ol', {'class': CAROUSAL_CLASS})

        if indent_one:
            find_level_1_hierarchy(hierarchy_name, indent_one)
        elif box_grid:
            find_box_grid_hierarchy(hierarchy_name, category_response)
        elif left_nav:
            find_left_nav_hierarchy(hierarchy_name, left_nav)
        elif carousel:
            find_carousel_hierarchy(hierarchy_name, category_response)
        else:
            # NOTE(review): the original indentation was lost; this fallback
            # is assumed to belong to the layout chain - confirm.
            urls_queue.put('{}|{}'.format(hierarchy_name, category_url))

    # NOTE(review): join() is assumed to run unconditionally at the end of
    # the function - confirm.
    urls_queue.join()
def get_seller_strore_link(raw_data):
    """
    Return the seller's storefront url.

    :param raw_data: parsed page to search for the 'storefront-link' div
    :return: url_format() of the href of the anchor inside that div, or the
        string 'not_avaible' when the div or its anchor is missing
    """
    store_link_tag = raw_data.find('div', {'id': 'storefront-link'})
    if store_link_tag and store_link_tag.a:
        return url_format(store_link_tag.a['href'])
    return 'not_avaible'
def find_carousal_sub_categories(hierarchy, page_soup, anchor_tag):
    """
    Queue the sub categories that belong to one carousel category.

    :param hierarchy: hierarchy name
    :param page_soup: BeautifulSoup response of the current page
    :param anchor_tag: anchor tag of the category; gives its name, id and url
    :return: None
    :working: the sub categories are read from the div whose id is
        'sub<category id>'. Each one that is not ignored is printed and
        queued as 'hierarchy|category|sub category|url'. When no sub
        category links are found, the category itself is queued as
        'hierarchy|category|url'
    """
    category_name = string_format(anchor_tag)
    category_id = anchor_tag['id']
    if category_name in IGNORE_LIST:
        return

    sub_category_container = page_soup.find(
        'div', {'id': 'sub{}'.format(category_id)})
    sub_category_list = []
    if sub_category_container:
        sub_category_list = sub_category_container.findAll(
            'a', {'class': SUB_CATEGORY_LIST_ANCHOR_TAG_CLASS})

    if not sub_category_list:
        # Fall back to the category's own link whether the container is
        # missing or merely empty. The previous code handled only one of
        # these two cases and silently dropped the category in the other.
        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
        return

    for sub_category in sub_category_list:
        sub_category_name = string_format(sub_category)
        sub_category_url = url_format(sub_category['href'])
        # The ignore test is done on the sub category name with the
        # category name removed from it.
        if sub_category_name.replace(category_name, '') in IGNORE_LIST:
            continue
        hierarchy_name = '{}|{}|{}'.format(
            hierarchy, category_name, sub_category_name)
        line = '{}|{}'.format(hierarchy_name, sub_category_url)
        print(line)
        urls_queue.put(line)
def collect_urls(hierarchy, raw_data):
    """
    Queue the url(s) of one category tile.

    :param hierarchy: hierarchy name
    :param raw_data: parsed markup of the category tile
    :return: None
    :working: when the tile has an 'acs-category-tile-shopall ' div, only
        the link of that div is queued; otherwise the link of every <li>
        is queued. Nothing is queued without an 'acs-category-tile-header'
        div, which supplies the category name
    """
    if not raw_data:
        return

    name_tag = raw_data.find('div', {'class': 'acs-category-tile-header'})
    shop_all_tag = raw_data.find(
        'div', {'class': 'acs-category-tile-shopall '})
    list_items = raw_data.findAll('li')

    if not name_tag:
        return
    hierarchy_name = '{}|{}'.format(hierarchy, string_format(name_tag))

    link_holders = [shop_all_tag] if shop_all_tag else list_items
    for holder in link_holders:
        category_url = url_format(holder.a['href'])
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
def get_level_1_see_more_hierarchy(hierarchy, page_soup):
    """
    Traverse the target of every "see more" link on the page.

    :param hierarchy: hierarchy name
    :param page_soup: BeautifulSoup response of the current page
    :return: None
    :working: for each SEE_MORE_CLASS paragraph, calls find_traverse_type
        with 'hierarchy|category name' and the url of the paragraph's link
    """
    if not page_soup:
        return

    for see_more_tag in page_soup.findAll("p", {"class": SEE_MORE_CLASS}):
        # The category name is the paragraph text without its first seven
        # characters (presumably a fixed "see more"-style prefix - TODO
        # confirm).
        label = str(see_more_tag.text[7:])
        category_name = string_format(label)
        category_url = url_format(see_more_tag.a["href"])
        find_traverse_type(
            '{}|{}'.format(hierarchy, category_name), category_url)
def find_hierarchy(hierarchy_name, url):
    """
    Recursively descend through the category tree below a page.

    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: current page url
    :return: None
    :working: a sub category container with an H4_TAG_CLASS heading marks
        the last level: its 'hierarchy|url' line is printed and passed to
        create_directory_and_hierarchy_files. Otherwise the function calls
        itself for every category link and appends each link's url to
        sub_category_urls
    """
    response = get_content(url)
    if not response:
        return

    sub_category_container = find_sub_category_container(response)
    if not sub_category_container:
        return

    h4_tag = sub_category_container.find('h4', {'class': H4_TAG_CLASS})
    if h4_tag:
        # An <h4> tag means this is the last level of the hierarchy.
        anchor_tag = h4_tag.find('a')
        if not anchor_tag:
            # A heading without a link has no url to store; this used to
            # raise TypeError on anchor_tag['href'].
            return
        category_url = url_format(anchor_tag['href'])
        line = '{}|{}'.format(hierarchy_name, category_url)
        print(line)
        # Store the line in a file and create the hierarchy directory.
        create_directory_and_hierarchy_files(hierarchy_name, line)
        return

    # More categories below: recurse into each of them.
    anchor_tags = sub_category_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy = '{}|{}'.format(hierarchy_name, category_name)
        find_hierarchy(hierarchy, category_url)
        # Reuse the url formatted above instead of formatting the same href
        # a second time.
        sub_category_urls.append(category_url)
def check_and_get_seller_data(raw_data):
    """
    Return the seller information for a seller element.

    :param raw_data: parsed markup that holds the seller name, optionally
        inside a link to the seller page
    :return: the result of get_seller_info when the seller page could be
        fetched; otherwise a 7-tuple with the seller name (or
        'not_available' when no name is found) followed by six
        'not_available' entries
    """
    missing = 'not_available'
    seller_name_tag = raw_data.find('a')

    if not seller_name_tag:
        # No link to a seller page: only the name, if any, is known.
        seller_name = text_format(raw_data)
        first_field = seller_name if seller_name else missing
        return (first_field,) + (missing,) * 6

    seller_name = text_format(seller_name_tag)
    seller_link = url_format(seller_name_tag['href'])
    seller_raw_data = response_getter.get_content(seller_link)
    if seller_raw_data:
        return get_seller_info(seller_name, seller_raw_data)
    return (seller_name,) + (missing,) * 6
def find_level_1_hierarchy(hierarchy, page_soup):
    """
    Queue every sub category link of the first 'a-list-item' span.

    :param hierarchy: hierarchy name
    :param page_soup: parsed markup that contains the category list
    :return: None
    :working: puts one 'hierarchy|sub category|url' line per
        NORMAL_ANCHOR_TAG_CLASS anchor on urls_queue
    """
    if not page_soup:
        return

    list_item = page_soup.find("span", {'class': 'a-list-item'})
    if not list_item:
        return

    for anchor in list_item.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        sub_category_name = string_format(anchor)
        hierarchy_name = '{}|{}'.format(hierarchy, sub_category_name)
        hierarchy_url = url_format(anchor['href'])
        urls_queue.put('{}|{}'.format(hierarchy_name, hierarchy_url))
def find_acs_nav_section(hierarchy, url):
    """
    Queue the first level of the hierarchy found in the ACS navigation
    sections of a page.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: sections whose name is in IGNORE_LIST are skipped; for the
        others a 'hierarchy|section|link name|url' line is built per link
        and queued only when its hierarchy part contains 'Tutto'
    """
    page = get_content(url)
    if not page:
        return

    category_container = page.find(
        'div', {'class': 'a-section a-spacing-base'})
    if not category_container:
        return

    sections = category_container.findAll('div', {'class': ACS_SECTION_CLASS})
    for section in sections:
        category_tag = section.find('div', {'class': 'acs-ln-links'})
        if not category_tag:
            continue

        category_name = string_format(category_tag)
        if category_name in IGNORE_LIST:
            continue

        for link in section.findAll('a'):
            sub_category_name = string_format(link)
            sub_category_url = url_format(link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, category_name, sub_category_name)
            line = '{}|{}'.format(hierarchy_name, sub_category_url)
            if 'Tutto' in hierarchy_name:
                urls_queue.put(line)
def get_level_1_hierarchy(hierarchy, page_soup):
    """
    Traverse every link of one navigation section.

    :param hierarchy: hierarchy name
    :param page_soup: parsed markup of the section; its first <p> supplies
        the main category name
    :return: None
    :working: every link that is not ignored is passed to find_traverse_type
        as 'hierarchy|main category|link name' together with its url
    """
    if not page_soup:
        return

    heading = page_soup.find('p')
    if not heading:
        return
    main_category_name = string_format(heading)

    for anchor_tag in page_soup.findAll('a'):
        sub_category_name = string_format(anchor_tag)
        # The ignore test is done on the link name with the main category
        # name removed from it.
        if sub_category_name.replace(main_category_name, '') in IGNORE_LIST:
            continue
        sub_category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}|{}'.format(
            hierarchy, main_category_name, sub_category_name)
        find_traverse_type(hierarchy_name, sub_category_url)
def find_nav_hierarchy(hierarchy, page_soup):
    """
    Queue the "see more" link of every section in the left navigation.

    :param hierarchy: hierarchy name
    :param page_soup: BeautifulSoup response of the current page
    :return: None
    :working: for each section that has a SEE_MORE_CLASS paragraph, puts a
        'hierarchy|section name|see-more url' line on urls_queue
    """
    if not page_soup:
        return

    nav_container = page_soup.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # The navigation markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for fragment in str(nav_container).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        see_more_tag = section.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
def find_left_nav_hierarchy(hierarchy, page_soup):
    """
    Queue the first link of every <ul> in the left navigation except the
    first <ul>.

    :param hierarchy: hierarchy name
    :param page_soup: parsed markup of the left navigation
    :return: None
    :working: puts one 'hierarchy|category|url' line on urls_queue for each
        remaining <ul> that contains an anchor
    """
    if not page_soup:
        return

    # The first <ul> is always skipped.
    for category in page_soup.findAll('ul')[1:]:
        category_tag = category.find('a')
        if not category_tag:
            continue
        category_name = string_format(category_tag)
        category_url = url_format(category_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
def find_nav_hierarchy(hierarchy, url):
    """
    Traverse the links of the left navigation sections.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    :working: links that are ignored are skipped; a link whose
        'hierarchy|section|link name' contains 'Jouets_par_cate_gorie' is
        printed and passed to find_traverse_type with its url and False
    """
    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # The navigation markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        for anchor_tag in nav_html.findAll('a'):
            sub_category_name = string_format(anchor_tag)
            # The ignore test is done on the link name with the section
            # name removed from it.
            if sub_category_name.replace(main_category_name, '') in IGNORE_LIST:
                continue

            sub_category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, main_category_name, sub_category_name)

            # NOTE(review): the original indentation was lost; the traversal
            # is assumed to sit inside this filter together with the print -
            # confirm.
            if 'Jouets_par_cate_gorie' in hierarchy_name:
                # print() with a single argument gives the same output on
                # Python 2 and Python 3; the former `print x` statement only
                # parses on Python 2.
                print(hierarchy_name)
                find_traverse_type(hierarchy_name, sub_category_url, False)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the links of the 'Shop_by_Genre' section of the left navigation.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: for every left-navigation section whose name contains
        'Shop_by_Genre', puts one 'hierarchy|link name|url' line per anchor
        on urls_queue; all other sections are skipped
    """
    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue

        main_category_name = string_format(category_container)
        # The filter depends only on the section name, so it is checked once
        # per section instead of once per anchor after formatting it.
        if 'Shop_by_Genre' not in main_category_name:
            continue

        for anchor_tag in nav_html.findAll('a'):
            sub_category_name = string_format(anchor_tag)
            hierarchy_name = '{}|{}'.format(hierarchy, sub_category_name)
            sub_category_url = url_format(anchor_tag['href'])
            urls_queue.put('{}|{}'.format(hierarchy_name, sub_category_url))
def get_nav_hierarchy(hierarchy, url):
    """
    Queue the first level of the hierarchy found in the left navigation.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    :working: looks for the LEFT_NAV_CLASS div inside the
        'a-section a-spacing-base' div; for each navigation section with a
        SEE_MORE_CLASS paragraph, queues and prints a
        'hierarchy|section name|see-more url' line
    """
    response_container = get_content(url)
    if not response_container:
        return

    response = response_container.find(
        'div', {'class': 'a-section a-spacing-base'})
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # The navigation markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        line = '{}|{}'.format(hierarchy_name, category_url)
        urls_queue.put(line)
        # print() with a single argument gives the same output on Python 2
        # and Python 3; the former `print line` statement only parses on
        # Python 2.
        print(line)
def find_acs_nav_section(hierarchy, url):
    """
    Queue the first level of the hierarchy found in the ACS widget
    navigation.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: sections whose header name is in IGNORE_LIST are skipped; of
        the remaining links only those whose name contains 'Todo' are
        queued and printed as 'hierarchy|section|link name|url'
    """
    response = get_content(url)
    if not response:
        return

    category_container = response.find(
        'div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not category_container:
        return

    for category in category_container.findAll(
            'div', {'class': ACS_SECTION_CLASS}):
        category_tag = category.find('button', {'class': 'acs-ln-header '})
        if not category_tag:
            continue

        category_name = string_format(category_tag)
        if category_name in IGNORE_LIST:
            continue

        for sub_category_link in category.findAll('a'):
            sub_category_name = string_format(sub_category_link)
            sub_category_url = url_format(sub_category_link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, category_name, sub_category_name)
            line = '{}|{}'.format(hierarchy_name, sub_category_url)
            if 'Todo' in sub_category_name:
                urls_queue.put(line)
                # print() with a single argument gives the same output on
                # Python 2 and Python 3; the former `print line` statement
                # only parses on Python 2.
                print(line)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the category linked from every small box-grid tile of a page.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: for each SMALL_BOX_GRID_CLASS div that contains an anchor,
        puts a 'hierarchy|category|url' line on urls_queue
    """
    page = get_content(url)
    if not page:
        return

    for tile in page.findAll('div', {'class': SMALL_BOX_GRID_CLASS}):
        anchor_tag = tile.find('a')
        if not anchor_tag:
            continue
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)
        urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
def find_acs_nav_section(hierarchy, url):
    """
    Traverse the links of the ACS widget navigation sections.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: sections whose header name is in IGNORE_LIST are skipped, and
        so are links whose 'hierarchy|section|link name' contains
        'Shopping_Tipps' or 'Spar_Abo'; every other link is passed to
        find_traverse_type with its url and False, then its hierarchy name
        is printed
    """
    page = get_content(url)
    if not page:
        return

    category_container = page.find(
        'div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not category_container:
        return

    excluded_markers = ('Shopping_Tipps', 'Spar_Abo')
    sections = category_container.findAll('div', {'class': ACS_SECTION_CLASS})
    for section in sections:
        header = section.find('button', {'class': 'acs-ln-header '})
        if not header:
            continue

        category_name = string_format(header)
        if category_name in IGNORE_LIST:
            continue

        for link in section.findAll('a'):
            sub_category_name = string_format(link)
            sub_category_url = url_format(link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, category_name, sub_category_name)
            if any(marker in hierarchy_name for marker in excluded_markers):
                continue
            find_traverse_type(hierarchy_name, sub_category_url, False)
            print(hierarchy_name)