def get_tree_hierarchy(hierarchy, url):
    """
    Dispatch every selected top-navigation category found on ``url``.

    Each ``<a class="nav-a">`` whose formatted name is in ``SELECTED_LIST``
    is fetched, and the fetched page is handed to the traversal routine
    matching the layout marker found on it.  The function then blocks on
    ``urls_queue.join()``.

    :param hierarchy: last level hierarchy name
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if response:
        for anchor_tag in response.findAll('a', {'class': 'nav-a'}):
            category_name = string_format(anchor_tag)
            if category_name not in SELECTED_LIST:
                continue
            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            # Keep the category page in its own variable and skip the
            # category when the fetch yields nothing, exactly as the first
            # fetch above is checked before use.
            category_response = get_content(category_url)
            if not category_response:
                continue
            left_nav = category_response.find(
                'div', {'class': LEFT_NAV_CLASS})
            carousel = category_response.find(
                'ol', {'class': CAROUSAL_CLASS})
            # The left-nav layout takes precedence over the carousel layout.
            if left_nav:
                find_nav_hierarchy(hierarchy_name, category_response)
            elif carousel:
                find_carousel_hierarchy(hierarchy_name, category_response)
    urls_queue.join()
def get_tree_hierarchy(hierarchy, url):
    """
    Walk the box-grid tiles of ``url`` and dispatch each tile's page.

    :param hierarchy: Hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if page:
        grid = page.find('div', {'class': BOX_GRID_CONTAINER_CLASS})
        if grid:
            for tile in grid.findAll('div', {'class': SMALL_BOX_GRID_CLASS}):
                tile_name = string_format(tile.img['alt'])
                tile_url = url_format(tile.find('a')['href'])
                hierarchy_name = '{}|{}'.format(hierarchy, tile_name)
                tile_page = get_content(tile_url)
                if not tile_page:
                    continue
                # Probe the tile's page for every known layout marker first,
                # then call the handler for the first marker that is present.
                indent_one = tile_page.find(
                    "ul", {'class': INDENT_ONE_CLASS})
                box_grid = tile_page.findAll(
                    'div', {'class': SMALL_BOX_GRID_CLASS})
                left_nav = tile_page.find(
                    'div', {'class': LEFT_NAV_CLASS})
                carousel = tile_page.find(
                    'ol', {'class': CAROUSAL_CLASS})
                if indent_one:
                    find_level_1_hierarchy(hierarchy_name, indent_one)
                elif box_grid:
                    find_box_grid_hierarchy(hierarchy_name, tile_page)
                elif left_nav:
                    find_left_nav_hierarchy(hierarchy_name, left_nav)
                elif carousel:
                    find_carousel_hierarchy(hierarchy_name, tile_page)
                else:
                    # No known layout marker: queue the page itself.
                    urls_queue.put(
                        '{}|{}'.format(hierarchy_name, tile_url))
    urls_queue.join()
def get_tree_hierarchy(hierarchy, url):
    """
    Pass every category-tile container found on ``url`` to ``collect_urls``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if page:
        tile_classes = (
            'acs-ux-innerc1 acs-category-tile-links ',
            'acs-ux-innerc2 acs-category-tile-links ',
            'acs-ux-innerc3 acs-category-tile-links ',
        )
        # Run all three lookups before anything is handed to collect_urls.
        tile_groups = [page.find_all('div', {'class': css_class})
                       for css_class in tile_classes]
        for tiles in tile_groups:
            for tile in tiles:
                collect_urls(hierarchy, tile)
    # wait till all the urls complete in queue
    urls_queue.join()
def get_product_urls(hierarchy_url):
    """
    Collect the product urls listed on one catalogue page.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: requests the page, builds one 'hierarchy|product_url' line per
        product tile that carries a link, and passes the lines to
        ``update_files`` together with the page url
    """
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[:-1])
    page_url = name_list[-1]
    # The first two hierarchy levels are left out of the path.
    hierarchy_path = '/'.join(name_list[2:-1])
    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        hierarchy_path)
    response = get_content(page_url)
    if not response:
        return
    product_url_tags = response.find_all(
        'div', {'class': 'catalogue-product row'})
    if not product_url_tags:
        return
    urls_list = []
    for product_url_tag in product_url_tags:
        anchor_tag = product_url_tag.find('a')
        # A tile without an anchor (or without an href) cannot yield a
        # product url; skip it instead of raising and losing the whole page.
        href = anchor_tag.get('href') if anchor_tag else None
        if href is None:
            continue
        product_url = '{}{}'.format(MAIN_URL, href)
        urls_list.append('{}|{}'.format(hierarchy_name, product_url))
    update_files(completed_path, hierarchy_name, urls_list, page_url,
                 PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue every link of the left navigation of ``url``, then start workers.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: each anchor inside the left-nav container is put on
        ``urls_queue`` as 'hierarchy|name|url'; ``create_workers`` is called
        afterwards
    """
    page = get_content(url)
    if page:
        left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
        if left_nav:
            for link in left_nav.findAll('a'):
                link_name = string_format(link)
                link_url = url_format(link['href'])
                hierarchy_name = '{}|{}'.format(hierarchy, link_name)
                urls_queue.put('{}|{}'.format(hierarchy_name, link_url))
    create_workers()
def form_hierarchy(hierarchy, url):
    """
    Collect the first hierarchy level from the banner blocks of ``url``.

    Prints a start message and, at the end, the start/end timestamps and the
    elapsed seconds.

    :param hierarchy: pipe-delimited hierarchy name
    :param url: current page url
    :return: None
    """
    leaf_name = hierarchy.split('|')[-1]
    print('Started {} Hierarchy Collection'.format(leaf_name))
    started_at = time.time()
    page = response_getter.get_content(url)
    if page:
        layout_classes = ('banner-layout-5', 'banner-layout-4',
                          'banner-layout-8', 'banner-layout-10')
        # Look up every banner layout before processing any of them.
        banner_groups = [page.find_all('div', {'class': layout})
                         for layout in layout_classes]
        for banners in banner_groups:
            if banners:
                first_level_hierarchy(hierarchy, banners)
    ended_at = time.time()
    elapsed = ended_at - started_at
    print('{} hierarchy collected |Started -> {} secs | Ended -> {} secs| Total -> {} secs '.format(
        leaf_name, started_at, ended_at, elapsed))
def find_nav_hierarchy(hierarchy, url):
    """
    Follow the "see more" link of each left-navigation section of ``url``.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return
    # The container markup is cut at every '<h3>' and each piece is parsed
    # on its own, so one piece corresponds to one heading's content.
    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue
        heading = section.find('p')
        if not heading:
            continue
        section_name = string_format(heading)
        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if see_more:
            section_url = url_format(see_more.a["href"])
            hierarchy_name = '{}|{}'.format(hierarchy, section_name)
            # Let find_traverse_type pick the handler for the linked page.
            find_traverse_type(hierarchy_name, section_url, False)
def find_traverse_type(hierarchy, url):
    """
    Fetch ``url`` and call the handler matching its navigation layout.

    Precedence: indent-two list, ACS left-nav widget, left nav, carousel.
    Nothing is called when the fetch fails or no marker is found.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})
    acs_left_nav = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url)
    elif acs_left_nav:
        find_acs_nav_section(hierarchy, page)
    elif left_nav:
        find_nav_hierarchy(hierarchy, page)
    elif carousel:
        find_carousel_hierarchy(hierarchy, page)
def find_hierarchy(hierarchy, url):
    """
    Recursively descend the indent-two category list of ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recurses into every sub-category link; when the list has no
        such links the page is passed to ``store_last_level_of_hierarchy``
    """
    page = get_content(url)
    if not page:
        return
    sub_categories = page.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories:
        return
    links = sub_categories.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    if not links:
        # No further links: treated as the last level of the hierarchy.
        store_last_level_of_hierarchy(hierarchy, page, url)
        return
    for link in links:
        child_name = string_format(link)
        child_url = url_format(link['href'])
        find_hierarchy('{}|{}'.format(hierarchy, child_name), child_url)
def get_product_info(hierarchy_url):
    """
    Fetch one product page, store its details and record it as completed.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: requests the page, obtains ``(data, date, time)`` from
        ``get_details`` and, when data is truthy, calls ``store`` and appends
        ``hierarchy_url`` to the completed-info file
    """
    parts = hierarchy_url.split('|')
    hierarchy_name = '|'.join(parts[:-1])
    page_url = parts[-1]
    # The first two hierarchy levels are left out of the path.
    hierarchy_path = '/'.join(parts[2:-1])
    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        hierarchy_path)
    page = get_content(page_url)
    if not page:
        return
    data, date_value, time_value = get_details(hierarchy_url, page)
    if data:
        store(MARKETPLACE, date_value, hierarchy_name, time_value, data)
        completed_file = '{}/{}'.format(completed_path, COMPLETED_INFO_FILE)
        append_to_file(completed_file, hierarchy_url)
def get_box_grid_hierarchy(hierarchy, url):
    """
    Queue the link of every small-box-grid tile found on ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: for each tile that contains an anchor, the anchor image's
        ``alt`` text is used as the category name and 'hierarchy|name|url'
        is put on ``urls_queue``
    """
    page = get_content(url)
    if not page:
        return
    for tile in page.findAll('div', {'class': SMALL_BOX_GRID_CLASS}):
        link = tile.find('a')
        if not link:
            continue
        alt_text = str(link.img['alt'].encode('utf-8'))
        tile_name = string_format(alt_text)
        tile_url = url_format(link['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, tile_name)
        # add links to the queue
        urls_queue.put('{}|{}'.format(hierarchy_name, tile_url))
def get_tree_hierarchy(hierarchy, url):
    """
    Queue every link of the indent-one category list of ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: puts 'hierarchy|name|url' on ``urls_queue`` for each link and
        then blocks on ``urls_queue.join()``
    """
    page = get_content(url)
    if page:
        sub_categories = page.find('ul', {'class': INDENT_ONE_CLASS})
        if sub_categories:
            links = sub_categories.findAll(
                'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
            for link in links:
                link_name = string_format(link)
                link_url = url_format(link['href'])
                hierarchy_name = '{}|{}'.format(hierarchy, link_name)
                urls_queue.put('{}|{}'.format(hierarchy_name, link_url))
    urls_queue.join()
def get_tree_hierarchy(main_category_name, url):
    """
    Queue the sub-category links found under the indent-none list of ``url``.

    Only the first ``<span class="a-list-item">`` inside that list is
    searched for links.  After queueing, the function blocks on
    ``urls_queue.join()``.

    :param main_category_name: Hierarchy name
    :param url: current_page_url
    :return: None
    """
    page = get_content(url)
    if page:
        category_list = page.find("ul", {'class': INDENT_NONE_CLASS})
        if category_list:
            first_item = category_list.find("span", {'class': 'a-list-item'})
            if first_item:
                links = first_item.findAll(
                    'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
                for link in links:
                    link_name = string_format(link)
                    hierarchy_name = '{}|{}'.format(
                        main_category_name, link_name)
                    link_url = url_format(link['href'])
                    urls_queue.put('{}|{}'.format(hierarchy_name, link_url))
    urls_queue.join()
def get_nav_hierarchy(hierarchy, url):
    """
    Queue the "see more" link of each left-navigation section of ``url``.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    :working: the left nav is looked up inside the 'a-section a-spacing-base'
        block; every queued line is also printed
    """
    page = get_content(url)
    if not page:
        return
    base_section = page.find('div', {'class': 'a-section a-spacing-base'})
    if not base_section:
        return
    left_nav = base_section.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return
    # The container markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue
        heading = section.find('p')
        if not heading:
            continue
        section_name = string_format(heading)
        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if see_more:
            section_url = url_format(see_more.a["href"])
            hierarchy_name = '{}|{}'.format(hierarchy, section_name)
            line = '{}|{}'.format(hierarchy_name, section_url)
            urls_queue.put(line)
            print(line)
def collect_main_page_urls(main_url):
    """
    Map 'category|sub_category' names to absolute urls from the home page.

    :param main_url: Home page url
    :return: dictionary with category names as keys and urls as values;
        None (after printing 'got none') when the page could not be fetched
    """
    home_page = get_content(main_url)
    if not home_page:
        print('got none')
        return
    urls_by_name = {}
    for group in home_page.findAll('div', {'class': 'popover-grouping'}):
        group_name_tag = group.find('h2', {'class': 'popover-category-name'})
        group_name = string_format(group_name_tag)
        for link in group.findAll('a'):
            link_name = string_format(link)
            name_key = '{}|{}'.format(group_name, link_name)
            # The href is prefixed with DOMAIN_NAME to form the full url.
            urls_by_name[name_key] = '{}{}'.format(DOMAIN_NAME, link['href'])
    return urls_by_name
def get_product_urls(hierarchy_url):
    """
    Collect product urls from a listing page, then traverse further pages.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: skips pages already present in the completed set; otherwise
        records the page's product urls through ``update_files``, determines
        the last page with ``find_last_page`` and calls ``traverse_pages``
    """
    hierarchy_name, _, page_url = hierarchy_url.rpartition('|')
    config = DataCollectors_Configuration
    completed_path = '{}{}{}{}{}{}{}'.format(
        config.ROOT_FOLDER,
        config.PATH_STYLE,
        config.AMAZON_CANADA_PROJECT_NAME,
        config.PATH_STYLE,
        hierarchy_name.replace('|', config.PATH_STYLE),
        config.PATH_STYLE,
        COMPLETED_PAGE)
    completed_set = file_to_set(completed_path)
    if in_completed_urls(page_url, completed_set):
        return
    response = get_content(page_url)
    if not response:
        return
    product_links = response.findAll('a', {
        'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
    if product_links:
        urls_list = []
        for product_link in product_links:
            product_url = url_format(product_link['href'])
            urls_list.append('{}|{}'.format(hierarchy_name, product_url))
        update_files(hierarchy_name, urls_list, page_url,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)
    last_page = find_last_page(response)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)
def find_acs_nav_section(hierarchy, url):
    """
    Traverse the 'Tout' links of the ACS left-navigation widget of ``url``.

    Sections whose header name is in ``IGNORE_LIST`` are skipped.  For the
    remaining sections, each link whose full hierarchy name contains 'Tout'
    is passed to ``find_traverse_type``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    widget = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not widget:
        return
    for section in widget.findAll('div', {'class': ACS_SECTION_CLASS}):
        header = section.find('button', {'class': 'acs-ln-header '})
        if not header:
            continue
        section_name = string_format(header)
        if section_name in IGNORE_LIST:
            continue
        for link in section.findAll('a'):
            link_name = string_format(link)
            link_url = url_format(link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, section_name, link_name)
            if 'Tout' in hierarchy_name:
                find_traverse_type(hierarchy_name, link_url, False)
def collect_all_data(hierarchy, url, last_page, completed_set):
    """
    Collect product urls from the numbered pages that follow ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :param last_page: last page number
    :param completed_set: completed url sets to compare
    :return: None
    :working: visits '<url>&page=N' for N starting at 2, skipping pages
        already in ``completed_set``, and reports each page's product urls
        through ``update_files``
    """
    url_list = []
    # NOTE(review): range() stops before last_page, so page number
    # ``last_page`` itself is never requested - confirm this is intended.
    for page_no in range(2, last_page):
        current_page = '{}&page={}'.format(url, page_no)
        if in_completed_urls(current_page, completed_set):
            continue
        response = get_content(current_page)
        if not response:
            continue
        product_links = response.findAll(
            'a',
            {'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
        if product_links:
            for product_link in product_links:
                product_url = url_format(product_link['href'])
                url_list.append('{}|{}'.format(hierarchy, product_url))
            print('{}|{}'.format(hierarchy, current_page))
            # NOTE(review): url_list is never reset, so each call also
            # carries the urls of earlier pages - verify against update_files.
            update_files(hierarchy, url_list, current_page,
                         PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def find_hierarchy(hierarchy, url):
    """
    Recursively descend the indent-two category list of ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recurses into every sub-category link; when there are none,
        the page's ``<h4>`` name and listing url are printed and handed to
        ``create_directory_and_hierarchy_files``
    """
    page = get_content(url)
    if not page:
        return
    sub_categories = page.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories:
        return
    links = sub_categories.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    if links:
        # More categories below this one: recurse into each of them.
        for link in links:
            child_name = string_format(link)
            child_url = url_format(link['href'])
            find_hierarchy('{}|{}'.format(hierarchy, child_name), child_url)
        return
    # No links left: this is handled as the last level of the hierarchy.
    heading = page.find('h4', {'class': H4_TAG_CLASS})
    if not heading:
        return
    leaf_name = string_format(heading)
    # Prefer the layout-picker link when the page has one; otherwise keep
    # the url that was requested.
    layout_link = page.find('a', {'title': LAYOUT_PICKER})
    leaf_url = url_format(layout_link['href']) if layout_link else url
    # Substring test: the name is appended only when it does not already
    # occur somewhere in the hierarchy string.
    if leaf_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, leaf_name)
    line = '{}|{}'.format(hierarchy_name, leaf_url)
    print(line)
    # store the line in a file and create hierarchy directory
    create_directory_and_hierarchy_files(hierarchy_name, line)
def get_nav_hierarchy(hierarchy, url):
    """
    Queue the first hierarchy level found in the left navigation of ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: a section with a "see more" link is queued as
        'hierarchy|section|url'; a section without one has each of its links
        queued as 'hierarchy|section|link|url'.  Sections matching the skip
        string are not queued.
    """
    # NOTE(review): membership is a substring test against this one string,
    # so partial names (and an empty name) match too - confirm intended.
    skipped_sections = 'Angebote_and_Aktionen|Kindle_Fire_and_Echo|Smartphones_and_mehr|Ratgeber_and_Services'
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return
    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue
        heading = section.find('p')
        if not heading:
            continue
        section_name = string_format(heading)
        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if see_more:
            if section_name not in skipped_sections:
                section_url = url_format(see_more.a["href"])
                hierarchy_name = '{}|{}'.format(hierarchy, section_name)
                urls_queue.put('{}|{}'.format(hierarchy_name, section_url))
            continue
        for link in section.findAll('a'):
            link_name = string_format(link)
            if section_name in skipped_sections:
                continue
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, section_name, link_name)
            link_url = url_format(link['href'])
            urls_queue.put('{}|{}'.format(hierarchy_name, link_url))
def check_and_get_seller_data(raw_data):
    """
    Return seller details for the seller block ``raw_data``.

    When the block contains a link and the linked page can be fetched, the
    result of ``get_seller_info`` is returned.  Otherwise a 7-tuple is
    returned: the seller name (or 'not_available' when no name is found)
    followed by six 'not_available' placeholders.

    :param raw_data: parsed markup of the seller block
    :return: value of ``get_seller_info`` or the 7-tuple described above
    """
    missing = 'not_available'
    seller_link = raw_data.find('a')
    if not seller_link:
        # No link: fall back to the block's own text as the seller name.
        seller_name = text_format(raw_data)
        if not seller_name:
            seller_name = missing
        return (seller_name,) + (missing,) * 6
    seller_name = text_format(seller_link)
    seller_url = url_format(seller_link['href'])
    seller_page = response_getter.get_content(seller_url)
    if seller_page:
        return get_seller_info(seller_name, seller_page)
    return (seller_name,) + (missing,) * 6
def find_acs_nav_section(hierarchy, url):
    """
    Queue the 'Tutto' links of the ACS navigation sections of ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: sections are looked up inside the 'a-section a-spacing-base'
        block; sections named in ``IGNORE_LIST`` are skipped and only links
        whose full hierarchy name contains 'Tutto' are put on ``urls_queue``
    """
    page = get_content(url)
    if not page:
        return
    base_section = page.find('div', {'class': 'a-section a-spacing-base'})
    if not base_section:
        return
    for section in base_section.findAll('div', {'class': ACS_SECTION_CLASS}):
        links_block = section.find('div', {'class': 'acs-ln-links'})
        if not links_block:
            continue
        section_name = string_format(links_block)
        if section_name in IGNORE_LIST:
            continue
        for link in section.findAll('a'):
            link_name = string_format(link)
            link_url = url_format(link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, section_name, link_name)
            line = '{}|{}'.format(hierarchy_name, link_url)
            if 'Tutto' in hierarchy_name:
                urls_queue.put(line)
def start_program():
    """
    Process each hard-coded Amazon Pantry start link.

    Every entry is 'hierarchy|url'.  For each entry ``create_workers`` is
    called, the url is fetched and, when the fetch succeeds, the page is
    passed to ``get_indent_two_hierarchy``.

    :return: None
    """
    links = [
        'Amazon_Pantry|Baby_and_Child_Care|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bc/262-6468249-9592357?ie=UTF8&node=8479375031',
        'Amazon_Pantry|Beer_Wine_and_Spirits|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bws/262-6468249-9592357?ie=UTF8&node=8464529031',
        'Amazon_Pantry|Beverages|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bv/262-6468249-9592357?ie=UTF8&node=5782664031',
        'Amazon_Pantry|Food_Cupboard|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_fc/262-6468249-9592357?ie=UTF8&node=5782663031',
        'Amazon_Pantry|Health_and_Beauty|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_hb/262-6468249-9592357?ie=UTF8&node=5790355031',
        'Amazon_Pantry|Household_Supplies|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_hs/262-6468249-9592357?ie=UTF8&node=5790354031',
        'Amazon_Pantry|Past_Purchases|https://www.amazon.co.uk/gp/pantry/past-purchases/ref=sd_allcat_prime_pantry_pp/262-6468249-9592357'
    ]
    for entry in links:
        # Everything before the last '|' is the hierarchy, the rest the url.
        name, _, start_url = entry.rpartition('|')
        create_workers()
        page = get_content(start_url)
        if page:
            get_indent_two_hierarchy(name, page, start_url)
def get_correct_data(hierarchy, url):
    """
    Fetch and parse ``url``, retrying up to ``CONSTANTS.MAX_RETRIES`` times.

    :param hierarchy: category hierarchy
    :param url: Current page Url
    :return: the first truthy result of ``get_data``; otherwise the last
        value it produced, or None when no fetch succeeded
    """
    data = None
    for _attempt in range(0, CONSTANTS.MAX_RETRIES):
        raw_data = response_getter.get_content(url)
        if not raw_data:
            continue
        # raw_data is handed to get_data, which yields the product details.
        data = get_data(raw_data, hierarchy, url)
        if data:
            break
    return data
def find_traverse_type(hierarchy, url, flag):
    """
    Fetch ``url`` and call the handler matching its navigation layout.

    The indent-two list takes precedence over the left nav.

    :param hierarchy: category hierarchy
    :param url: current page url
    :param flag: forwarded unchanged to ``get_indent_two_hierarchy``;
        true when the function is called for the first time, false otherwise
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})
    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url, flag=flag)
    elif left_nav:
        find_nav_hierarchy(hierarchy, page)
def get_tree_hierarchy(hierarchy, url):
    """
    Dispatch each left-navigation section of ``url`` by its name.

    Sections named in ``IGNORE_LIST`` are skipped, sections named in
    ``SELECTED_LIST`` go to ``get_level_1_see_more_hierarchy`` and all others
    go to ``get_level_1_hierarchy``.  The function then blocks on
    ``urls_queue.join()``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if page:
        left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
        if left_nav:
            for chunk in str(left_nav).split('<h3>'):
                section = BeautifulSoup(chunk, 'lxml')
                if not section:
                    continue
                heading = section.find('p')
                if not heading:
                    continue
                section_name = string_format(heading)
                if section_name in IGNORE_LIST:
                    continue
                if section_name in SELECTED_LIST:
                    get_level_1_see_more_hierarchy(hierarchy, section)
                else:
                    get_level_1_hierarchy(hierarchy, section)
    urls_queue.join()
def find_traverse_type(hierarchy, url):
    """
    Fetch ``url`` and call the handler matching its navigation layout.

    Precedence: indent-two list, left nav, carousel.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})
    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url)
    elif left_nav:
        get_level_1_see_more_hierarchy(hierarchy, page)
    elif carousel:
        find_carousel_hierarchy(hierarchy, page)
def find_nav_hierarchy(hierarchy, url):
    """
    Traverse the left-navigation links of ``url`` under 'Jouets_par_cate_gorie'.

    A link is skipped when its name, with the section name removed, is in
    ``IGNORE_LIST``.  Remaining links whose full hierarchy name contains
    'Jouets_par_cate_gorie' are printed and passed to ``find_traverse_type``.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return
    # The container markup is cut at every '<h3>' and each piece is parsed
    # on its own.
    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue
        heading = section.find('p')
        if not heading:
            continue
        section_name = string_format(heading)
        for link in section.findAll('a'):
            link_name = string_format(link)
            if link_name.replace(section_name, '') in IGNORE_LIST:
                continue
            link_url = url_format(link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, section_name, link_name)
            if 'Jouets_par_cate_gorie' in hierarchy_name:
                print(hierarchy_name)
                find_traverse_type(hierarchy_name, link_url, False)
def get_tree_hierarchy(hierarchy, url):
    """
    Queue every left-navigation link of ``url`` outside 'New_Arrivals'.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: each link of a section whose name does not contain
        'New_Arrivals' is put on ``urls_queue`` as
        'hierarchy|section|link|url'
    """
    page = get_content(url)
    if not page:
        return
    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return
    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue
        heading = section.find('p')
        if not heading:
            continue
        section_name = string_format(heading)
        for link in section.findAll('a'):
            link_name = string_format(link)
            if 'New_Arrivals' in section_name:
                continue
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, section_name, link_name)
            link_url = url_format(link['href'])
            urls_queue.put('{}|{}'.format(hierarchy_name, link_url))
def find_hierarchy(hierarchy_name, url):
    """
    Recursively descend the sub-category container of ``url``.

    When the container holds an ``<h4>`` marker, the marker's link is printed
    and handed to ``create_directory_and_hierarchy_files``.  Otherwise every
    sub-category link is recursed into and its url is appended to
    ``sub_category_urls``.

    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return
    container = find_sub_category_container(page)
    if not container:
        return
    leaf_marker = container.find('h4', {'class': H4_TAG_CLASS})
    if leaf_marker:
        # An <h4> marker is treated as the last level of the hierarchy.
        leaf_link = leaf_marker.find('a')
        leaf_url = url_format(leaf_link['href'])
        line = '{}|{}'.format(hierarchy_name, leaf_url)
        print(line)
        # store the line in a file and create hierarchy directory
        create_directory_and_hierarchy_files(hierarchy_name, line)
        return
    for link in container.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        child_name = string_format(link)
        child_url = url_format(link['href'])
        child_hierarchy = '{}|{}'.format(hierarchy_name, child_name)
        find_hierarchy(child_hierarchy, child_url)
        sub_category_urls.append(url_format(link['href']))