def get_box_grid_hierarchy(hierarchy, url):
    """
    Queue one ``hierarchy|name|url`` line per small-box-grid tile on a page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: every div carrying SMALL_BOX_GRID_CLASS that contains an anchor
              contributes one line; the name comes from the alt text of the
              anchor's image and the url from the anchor's href
    """
    page = get_content(url)
    if not page:
        return

    for tile in page.findAll('div', {'class': SMALL_BOX_GRID_CLASS}):
        link = tile.find('a')
        if not link:
            # a tile without an anchor carries no category link
            continue

        alt_text = link.img['alt'].encode('utf-8')
        name = string_format(str(alt_text))
        target = url_format(link['href'])

        # queue the entry as "<hierarchy>|<name>|<url>"
        path = '{}|{}'.format(hierarchy, name)
        urls_queue.put('{}|{}'.format(path, target))
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the categories listed in the page's INDENT_ONE_CLASS list.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: each anchor with NORMAL_ANCHOR_TAG_CLASS inside the first
              ``ul`` of class INDENT_ONE_CLASS is put on urls_queue as
              ``hierarchy|name|url``; urls_queue.join() is then called
    """
    page = get_content(url)
    if not page:
        return

    listing = page.find('ul', {'class': INDENT_ONE_CLASS})
    if not listing:
        return

    for link in listing.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        name = string_format(link)
        target = url_format(link['href'])

        path = '{}|{}'.format(hierarchy, name)
        urls_queue.put('{}|{}'.format(path, target))

    # join() runs whenever the list exists, even if it held no anchors
    urls_queue.join()
# Example #3
# 0
def get_tree_hierarchy(hierarchy, url):
    """
    Queue every link of the left navigation and then start the workers.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: each anchor inside the div of class LEFT_NAV_CLASS is put on
              urls_queue as ``hierarchy|name|url``; create_workers() is
              called only when at least one anchor was found
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    links = left_nav.findAll('a')
    if not links:
        return

    for link in links:
        name = string_format(link)
        target = url_format(link['href'])

        path = '{}|{}'.format(hierarchy, name)
        urls_queue.put('{}|{}'.format(path, target))

    create_workers()
def store_last_level_of_hierarchy(hierarchy, page_soup, url):
    """
    Print and persist the line describing a last-level category page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed page (BeautifulSoup response); nothing happens
                      when it is falsy
    :param url: current page url, used when the page has no link whose
                title equals LAYOUT_PICKER
    :return: None
    :working: reads the category name from the h4 tag of class H4_TAG_CLASS,
              builds ``hierarchy|url``, prints it and passes it to
              create_directory_and_hierarchy_files
    """
    if not page_soup:
        return

    h4_tag = page_soup.find('h4', {'class': H4_TAG_CLASS})
    if not h4_tag:
        return

    category_name = string_format(h4_tag)

    # To get the tiles view url; fall back to the current page url
    category_url_tag = page_soup.find('a', {'title': LAYOUT_PICKER})
    if category_url_tag:
        category_url = url_format(category_url_tag['href'])
    else:
        category_url = url

    # Append the category name unless it already occurs in the hierarchy.
    # NOTE(review): this is a substring test on the whole hierarchy string,
    # not a comparison against individual levels - confirm that is intended.
    if category_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

    line = '{}|{}'.format(hierarchy_name, category_url)
    # call form of print: same output for a single argument, and valid
    # syntax on both Python 2 and Python 3
    print(line)

    # store the line in a file and create hierarchy directory
    create_directory_and_hierarchy_files(hierarchy_name, line)
def find_hierarchy(hierarchy, url):
    """
    Recursively walk the INDENT_TWO_CLASS sub-category list of a page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: when the list contains anchors of NORMAL_ANCHOR_TAG_CLASS the
              function calls itself for each of them with the extended
              hierarchy; when the list exists but has no such anchors the
              page is handed to store_last_level_of_hierarchy
    """
    page = get_content(url)
    if not page:
        return

    listing = page.find('ul', {'class': INDENT_TWO_CLASS})
    if not listing:
        return

    links = listing.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    if not links:
        # no further categories: this is the last level of the hierarchy
        store_last_level_of_hierarchy(hierarchy, page, url)
        return

    # more categories below: descend into each one
    for link in links:
        name = string_format(link)
        target = url_format(link['href'])
        find_hierarchy('{}|{}'.format(hierarchy, name), target)
def find_nav_hierarchy(hierarchy, url):
    """
    Follow the "see more" link of every section of the left navigation.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: the markup of the LEFT_NAV_CLASS div is split on ``<h3>`` and
              each piece is re-parsed with BeautifulSoup (lxml); a piece that
              has a ``p`` tag and a ``p`` of class SEE_MORE_CLASS is passed
              to find_traverse_type with the extended hierarchy
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if not see_more:
            continue

        target = url_format(see_more.a["href"])
        path = '{}|{}'.format(hierarchy, main_category_name)

        # let find_traverse_type decide how the linked page is traversed
        find_traverse_type(path, target, False)
def get_tree_hierarchy(hierarchy, url):
    """
    Dispatch the selected top-navigation categories to the matching traversal.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: for every anchor of class ``nav-a`` whose formatted name is in
              SELECTED_LIST the linked page is fetched; a page with a
              LEFT_NAV_CLASS div goes to find_nav_hierarchy, otherwise a page
              with a CAROUSAL_CLASS ``ol`` goes to find_carousel_hierarchy.
              A category page that could not be fetched is skipped.
              urls_queue.join() is called once the anchors are processed.
    """
    response = get_content(url)
    if not response:
        return

    nav_links = response.findAll('a', {'class': 'nav-a'})
    if not nav_links:
        return

    for anchor_tag in nav_links:
        category_name = string_format(anchor_tag)
        if category_name not in SELECTED_LIST:
            continue

        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

        # Keep the category page in its own variable and skip it when the
        # fetch produced nothing, instead of failing on ``None.find``.
        category_page = get_content(category_url)
        if not category_page:
            continue

        if category_page.find('div', {'class': LEFT_NAV_CLASS}):
            find_nav_hierarchy(hierarchy_name, category_page)
        elif category_page.find('ol', {'class': CAROUSAL_CLASS}):
            find_carousel_hierarchy(hierarchy_name, category_page)

    urls_queue.join()
def get_indent_two_hierarchy(hierarchy, page_soup, url):
    """
    Queue the INDENT_TWO_CLASS sub-categories of an already parsed page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: BeautifulSoup response; nothing happens when it is falsy
    :param url: current page url, forwarded to store_last_level_of_hierarchy
    :return: None
    :working: anchors of NORMAL_ANCHOR_TAG_CLASS inside the list are put on
              urls_queue as ``hierarchy|name|url``; when the list exists but
              holds no such anchors the page is treated as the last level
    """
    if not page_soup:
        return

    listing = page_soup.find('ul', {'class': INDENT_TWO_CLASS})
    if not listing:
        return

    links = listing.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    if not links:
        # no further categories: store this page as the last level
        store_last_level_of_hierarchy(hierarchy, page_soup, url)
        return

    for link in links:
        name = string_format(link)
        target = url_format(link['href'])

        path = '{}|{}'.format(hierarchy, name)
        urls_queue.put('{}|{}'.format(path, target))
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the first hierarchy level found in the left navigation.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: the LEFT_NAV_CLASS div is split on ``<h3>`` and each piece is
              re-parsed. Sections whose name contains 'Neuheiten' or 'Shops'
              queue nothing. A section with a SEE_MORE_CLASS paragraph queues
              ``hierarchy|section|url`` for that link; otherwise every anchor
              of the section queues ``hierarchy|section|sub|url``.
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if see_more:
            # the section has its own "see more" page: queue only that link
            if 'Neuheiten' not in main_category_name and 'Shops' not in main_category_name:
                target = url_format(see_more.a["href"])
                path = '{}|{}'.format(hierarchy, main_category_name)
                urls_queue.put('{}|{}'.format(path, target))
            continue

        # no "see more" link: queue each sub-category link of the section
        for link in section.findAll('a'):
            sub_category_name = string_format(link)
            if 'Neuheiten' not in main_category_name and 'Shops' not in main_category_name:
                path = '{}|{}|{}'.format(hierarchy, main_category_name, sub_category_name)
                target = url_format(link['href'])
                urls_queue.put('{}|{}'.format(path, target))
# Example #10
# 0
def get_tree_hierarchy(hierarchy, url):
    """
    Visit every box-grid category and dispatch it by the layout of its page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: for each SMALL_BOX_GRID_CLASS div inside the
              BOX_GRID_CONTAINER_CLASS div the linked page is fetched and
              probed for four layouts (indent-one list, small box grid, left
              nav, carousel); the first match selects the traversal function,
              and a page matching none is queued directly. urls_queue.join()
              is called after the loop when at least one category div existed.
    """
    page = get_content(url)
    if not page:
        return

    grid = page.find('div', {'class': BOX_GRID_CONTAINER_CLASS})
    if not grid:
        return

    tiles = grid.findAll('div', {'class': SMALL_BOX_GRID_CLASS})
    if not tiles:
        return

    for tile in tiles:
        name = string_format(tile.img['alt'])
        category_url = url_format(tile.find('a')['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, name)

        category_page = get_content(category_url)
        if not category_page:
            continue

        # Probe the fetched page for every known layout first, then call the
        # function that handles the first layout that is present.
        indent_one = category_page.find("ul", {'class': INDENT_ONE_CLASS})
        box_grid = category_page.findAll('div', {'class': SMALL_BOX_GRID_CLASS})
        left_nav = category_page.find('div', {'class': LEFT_NAV_CLASS})
        carousel = category_page.find('ol', {'class': CAROUSAL_CLASS})

        if indent_one:
            find_level_1_hierarchy(hierarchy_name, indent_one)
        elif box_grid:
            find_box_grid_hierarchy(hierarchy_name, category_page)
        elif left_nav:
            find_left_nav_hierarchy(hierarchy_name, left_nav)
        elif carousel:
            find_carousel_hierarchy(hierarchy_name, category_page)
        else:
            # unknown layout: queue the category page itself
            urls_queue.put('{}|{}'.format(hierarchy_name, category_url))

    urls_queue.join()
def find_carousal_sub_categories(hierarchy, page_soup, anchor_tag):
    """
    Queue the sub-categories that belong to one carousel entry.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: BeautifulSoup response searched for the sub-category div
    :param anchor_tag: carousel anchor supplying the category name, id and href
    :return: None
    :working: looks up the div whose id is ``sub<anchor id>``. When it exists,
              each anchor of SUB_CATEGORY_LIST_ANCHOR_TAG_CLASS inside it is
              printed and queued as ``hierarchy|category|sub|url`` unless its
              name, with the category name removed, is in IGNORE_LIST. When
              the div is missing the carousel anchor itself is queued.
              Categories named in IGNORE_LIST queue nothing.
    """
    category_name = string_format(anchor_tag)
    category_id = anchor_tag['id']
    if category_name in IGNORE_LIST:
        return

    sub_container = page_soup.find('div', {'id': 'sub{}'.format(category_id)})
    if not sub_container:
        # no sub-category block: queue the carousel link itself
        own_url = url_format(anchor_tag['href'])
        own_path = '{}|{}'.format(hierarchy, category_name)
        urls_queue.put('{}|{}'.format(own_path, own_url))
        return

    sub_links = sub_container.findAll(
        'a', {'class': SUB_CATEGORY_LIST_ANCHOR_TAG_CLASS})
    for sub_link in sub_links:
        sub_name = string_format(sub_link)
        sub_url = url_format(sub_link['href'])
        if sub_name.replace(category_name, '') in IGNORE_LIST:
            # ignored entries are skipped
            continue

        path = '{}|{}|{}'.format(hierarchy, category_name, sub_name)
        line = '{}|{}'.format(path, sub_url)

        # add links to the queue
        print(line)
        urls_queue.put(line)
def find_acs_nav_section(hierarchy, url):
    """
    Queue the 'Tutto' links of every ACS section on a page.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: inside the div of class ``a-section a-spacing-base`` each
              ACS_SECTION_CLASS div with an ``acs-ln-links`` div provides a
              category name; sections named in IGNORE_LIST are skipped. For
              every anchor of a kept section the line
              ``hierarchy|category|sub|url`` is queued only when the
              hierarchy part contains 'Tutto'.
    """
    page = get_content(url)
    if not page:
        return

    outer = page.find('div', {'class': 'a-section a-spacing-base'})
    if not outer:
        return

    for section in outer.findAll('div', {'class': ACS_SECTION_CLASS}):
        links_box = section.find('div', {'class': 'acs-ln-links'})
        if not links_box:
            continue

        category_name = string_format(links_box)
        if category_name in IGNORE_LIST:
            continue

        for link in section.findAll('a'):
            sub_name = string_format(link)
            sub_url = url_format(link['href'])

            path = '{}|{}|{}'.format(hierarchy, category_name, sub_name)
            line = '{}|{}'.format(path, sub_url)
            if 'Tutto' in path:
                urls_queue.put(line)
def get_level_1_hierarchy(hierarchy, page_soup):
    """
    Traverse every sub-category link of one navigation section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed navigation section; nothing happens when falsy
    :return: None
    :working: the first ``p`` tag names the main category; every anchor whose
              name, with the main category name removed, is not in
              IGNORE_LIST is passed to find_traverse_type as
              ``hierarchy|main|sub`` together with its url
    """
    if not page_soup:
        return

    heading = page_soup.find('p')
    if not heading:
        return
    main_name = string_format(heading)

    for link in page_soup.findAll('a'):
        sub_name = string_format(link)
        if sub_name.replace(main_name, '') in IGNORE_LIST:
            continue

        sub_url = url_format(link['href'])
        path = '{}|{}|{}'.format(hierarchy, main_name, sub_name)
        find_traverse_type(path, sub_url)
def get_level_1_see_more_hierarchy(hierarchy, page_soup):
    """
    Traverse the target of every "see more" paragraph of a section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed navigation section; nothing happens when falsy
    :return: None
    :working: for each ``p`` of class SEE_MORE_CLASS the category name is the
              paragraph text from index 7 onwards and the url is the href of
              its anchor; both go to find_traverse_type
    """
    if not page_soup:
        return

    for see_more in page_soup.findAll("p", {"class": SEE_MORE_CLASS}):
        # presumably the first 7 characters are a fixed "see more" prefix -
        # TODO confirm against the scraped markup
        label = str(see_more.text[7:])
        name = string_format(label)

        target = url_format(see_more.a["href"])

        find_traverse_type('{}|{}'.format(hierarchy, name), target)
# Example #15
# 0
def find_nav_hierarchy(hierarchy, url):
    """
    Traverse the left-navigation links that fall under 'Jouets_par_cate_gorie'.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: the LEFT_NAV_CLASS div is split on ``<h3>`` and each piece is
              re-parsed with BeautifulSoup (lxml). For every anchor of a piece
              whose name, with the section name removed, is not in
              IGNORE_LIST, the hierarchy ``hierarchy|section|sub`` is built;
              when it contains 'Jouets_par_cate_gorie' it is printed and
              passed to find_traverse_type.
    """
    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # each piece between <h3> markers is parsed as its own section
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        for anchor_tag in nav_html.findAll('a'):
            sub_category_name = string_format(anchor_tag)
            if sub_category_name.replace(main_category_name, '') in IGNORE_LIST:
                continue

            sub_category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, main_category_name, sub_category_name)

            if 'Jouets_par_cate_gorie' in hierarchy_name:
                # call form of print: same output for a single argument, and
                # valid syntax on both Python 2 and Python 3
                print(hierarchy_name)
                find_traverse_type(hierarchy_name, sub_category_url, False)
# Example #16
# 0
def get_tree_hierarchy(hierarchy, url):
    """
    Queue the links of the 'Shop_by_Genre' left-navigation sections.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: the LEFT_NAV_CLASS div is split on ``<h3>`` and each piece is
              re-parsed; for sections whose name contains 'Shop_by_Genre'
              every anchor is queued as ``hierarchy|sub|url`` (the section
              name itself is not part of the line)
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        for link in section.findAll('a'):
            sub_name = string_format(link)
            if 'Shop_by_Genre' not in main_category_name:
                continue

            path = '{}|{}'.format(hierarchy, sub_name)
            target = url_format(link['href'])
            urls_queue.put('{}|{}'.format(path, target))
def find_acs_nav_section(hierarchy, url):
    """
    Traverse the sub-category links of every ACS widget section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: inside the ACS_WIDGET_LEFT_NAV_CLASS div each ACS_SECTION_CLASS
              div with a header button provides a category name; sections in
              IGNORE_LIST are skipped. Each anchor yields
              ``hierarchy|category|sub``; unless that contains
              'Shopping_Tipps' or 'Spar_Abo' it is passed to
              find_traverse_type and then printed.
    """
    page = get_content(url)
    if not page:
        return

    widget = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not widget:
        return

    for section in widget.findAll('div', {'class': ACS_SECTION_CLASS}):
        header = section.find('button', {'class': 'acs-ln-header '})
        if not header:
            continue

        category_name = string_format(header)
        if category_name in IGNORE_LIST:
            continue

        for link in section.findAll('a'):
            sub_name = string_format(link)
            sub_url = url_format(link['href'])

            path = '{}|{}|{}'.format(hierarchy, category_name, sub_name)
            if 'Shopping_Tipps' not in path and 'Spar_Abo' not in path:
                find_traverse_type(path, sub_url, False)
                print(path)
# Example #18
# 0
def find_acs_nav_section(hierarchy, url):
    """
    Queue the 'Todo' links of every ACS widget section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: inside the ACS_WIDGET_LEFT_NAV_CLASS div each ACS_SECTION_CLASS
              div with a header button provides a category name; sections in
              IGNORE_LIST are skipped. Each anchor whose formatted name
              contains 'Todo' is queued as ``hierarchy|category|sub|url``
              and the line is printed.
    """
    response = get_content(url)
    if not response:
        return

    category_container = response.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not category_container:
        return

    for category in category_container.findAll('div', {'class': ACS_SECTION_CLASS}):
        category_tag = category.find('button', {'class': 'acs-ln-header '})
        if not category_tag:
            continue

        category_name = string_format(category_tag)
        if category_name in IGNORE_LIST:
            continue

        for sub_category_link in category.findAll('a'):
            sub_category_name = string_format(sub_category_link)
            sub_category_url = url_format(sub_category_link['href'])

            hierarchy_name = '{}|{}|{}'.format(hierarchy, category_name, sub_category_name)
            line = '{}|{}'.format(hierarchy_name, sub_category_url)
            if 'Todo' in sub_category_name:
                urls_queue.put(line)
                # call form of print: same output for a single argument, and
                # valid syntax on both Python 2 and Python 3
                print(line)
# Example #19
# 0
def find_level_1_hierarchy(hierarchy, page_soup):
    """
    Queue the sub-category links of the first ``a-list-item`` span.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed category container; nothing happens when falsy
    :return: None
    :working: every anchor of NORMAL_ANCHOR_TAG_CLASS inside the span is put
              on urls_queue as ``hierarchy|name|url``
    """
    if not page_soup:
        return

    list_item = page_soup.find("span", {'class': 'a-list-item'})
    if not list_item:
        return

    for link in list_item.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        name = string_format(link)
        path = '{}|{}'.format(hierarchy, name)
        target = url_format(link['href'])

        # add links to the queue
        urls_queue.put('{}|{}'.format(path, target))
def collect_urls(hierarchy, raw_data):
    """
    Queue the url(s) of one category tile.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param raw_data: parsed category tile; nothing happens when falsy
    :return: None
    :working: the tile header names the category. When the tile has a
              "shop all" div only that link is queued; otherwise the link of
              every ``li`` in the tile is queued. Each line has the form
              ``hierarchy|category|url``.
    """
    if not raw_data:
        return

    header = raw_data.find('div', {'class': 'acs-category-tile-header'})
    shop_all = raw_data.find('div', {'class': 'acs-category-tile-shopall '})
    list_items = raw_data.findAll('li')
    if not header:
        return

    path = '{}|{}'.format(hierarchy, string_format(header))

    # a "shop all" link stands for the whole tile; without it use each item
    if shop_all:
        sources = [shop_all]
    else:
        sources = list_items

    for source in sources:
        target = url_format(source.a['href'])
        urls_queue.put('{}|{}'.format(path, target))
def find_nav_hierarchy(hierarchy, page_soup):
    """
    Queue the "see more" link of every left-navigation section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed page; nothing happens when it is falsy
    :return: None
    :working: the LEFT_NAV_CLASS div is split on ``<h3>`` and each piece is
              re-parsed with BeautifulSoup (lxml); a piece with a ``p`` tag
              and a ``p`` of class SEE_MORE_CLASS is queued as
              ``hierarchy|section|url``
    """
    if not page_soup:
        return

    left_nav = page_soup.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue
        main_category_name = string_format(heading)

        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if not see_more:
            continue

        target = url_format(see_more.a["href"])
        path = '{}|{}'.format(hierarchy, main_category_name)
        urls_queue.put('{}|{}'.format(path, target))
def get_tree_hierarchy(hierarchy, url):
    """
    Route every left-navigation section to the matching level-1 handler.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: the LEFT_NAV_CLASS div is split on ``<h3>`` and each piece is
              re-parsed. Sections named in IGNORE_LIST are skipped, sections
              named in SELECTED_LIST go to get_level_1_see_more_hierarchy and
              all others go to get_level_1_hierarchy. urls_queue.join() is
              called once all sections are processed.
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for chunk in str(left_nav).split('<h3>'):
        section = BeautifulSoup(chunk, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue

        main_category_name = string_format(heading)
        if main_category_name in IGNORE_LIST:
            continue

        if main_category_name in SELECTED_LIST:
            get_level_1_see_more_hierarchy(hierarchy, section)
        else:
            get_level_1_hierarchy(hierarchy, section)

    urls_queue.join()
# Example #23
# 0
def find_left_nav_hierarchy(hierarchy, page_soup):
    """
    Queue the first link of every ``ul`` in the container except the first.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param page_soup: parsed left-nav container; nothing happens when falsy
    :return: None
    :working: the first ``ul`` found is skipped; for each later ``ul`` that
              contains an anchor, ``hierarchy|name|url`` built from its first
              anchor is put on urls_queue
    """
    if not page_soup:
        return

    for position, listing in enumerate(page_soup.findAll('ul')):
        if position == 0:
            # the first list is never used as a category
            continue

        link = listing.find('a')
        if not link:
            continue

        name = string_format(link)
        target = url_format(link['href'])

        # add links to the queue
        path = '{}|{}'.format(hierarchy, name)
        urls_queue.put('{}|{}'.format(path, target))
# Example #24
# 0
def get_nav_hierarchy(hierarchy, url):
    """
    Queue and print the "see more" link of every left-navigation section.

    :param hierarchy: pipe-delimited hierarchy name collected so far
    :param url: url of the page fetched through get_content
    :return: None
    :working: looks for the LEFT_NAV_CLASS div inside the div of class
              ``a-section a-spacing-base``, splits its markup on ``<h3>`` and
              re-parses each piece with BeautifulSoup (lxml); a piece with a
              ``p`` tag and a ``p`` of class SEE_MORE_CLASS is queued as
              ``hierarchy|section|url`` and the line is printed
    """
    response_container = get_content(url)
    if not response_container:
        return

    response = response_container.find(
        'div', {'class': 'a-section a-spacing-base'})
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # for every section find the see more tag and hierarchy name and store
    # the sub category url
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        line = '{}|{}'.format(hierarchy_name, category_url)
        urls_queue.put(line)
        # call form of print: same output for a single argument, and valid
        # syntax on both Python 2 and Python 3
        print(line)
# Example #25
# 0
def find_hierarchy(hierarchy_name, url):
    """
    Recursively walk the sub-category container of a page.

    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: url of the page fetched through get_content
    :return: None
    :working: when the container returned by find_sub_category_container
              holds an h4 tag of class H4_TAG_CLASS the page is the last
              level: the line ``hierarchy|url`` built from the h4's anchor is
              printed and passed to create_directory_and_hierarchy_files.
              Otherwise the function calls itself for every anchor of
              NORMAL_ANCHOR_TAG_CLASS and appends that anchor's url to
              sub_category_urls afterwards.
    """
    page = get_content(url)
    if not page:
        return

    container = find_sub_category_container(page)
    if not container:
        return

    h4_tag = container.find('h4', {'class': H4_TAG_CLASS})
    if h4_tag:
        # an <h4> tag marks the last level of the hierarchy
        # NOTE(review): an h4 without an anchor would fail on the subscript
        # below - confirm the markup always provides one.
        leaf_link = h4_tag.find('a')

        leaf_url = url_format(leaf_link['href'])
        line = '{}|{}'.format(hierarchy_name, leaf_url)
        print(line)

        # store the line in a file and create hierarchy directory
        create_directory_and_hierarchy_files(hierarchy_name, line)
        return

    # more categories below: descend into each one
    for link in container.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        name = string_format(link)
        child_url = url_format(link['href'])
        child_hierarchy = '{}|{}'.format(hierarchy_name, name)

        # recursively calling this function
        find_hierarchy(child_hierarchy, child_url)

        sub_category_urls.append(url_format(link['href']))