Python get_content示例，response_getter.get_content Python示例

示例#1

0

显示文件

文件： sports_and_outdoors.py 项目： akhilreddyyeredla/Web_Crawlers

def get_tree_hierarchy(hierarchy, url):
    """
    Dispatch every selected top-level category of a page to its handler.

    Fetches ``url``, scans its ``nav-a`` anchors and, for each anchor whose
    formatted name is in ``SELECTED_LIST``, fetches the category page and
    hands it to the handler matching the layout marker found on that page.

    :param hierarchy: last level hierarchy name
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    category_container = response.findAll('a', {'class': 'nav-a'})
    if not category_container:
        return

    for anchor_tag in category_container:
        category_name = string_format(anchor_tag)
        if category_name not in SELECTED_LIST:
            continue

        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

        # Stored under its own name so the landing-page response is not
        # overwritten while its anchors are still being iterated.
        category_response = get_content(category_url)
        if not category_response:
            # The landing page above is checked for a falsy result, so the
            # category page gets the same check: skip it instead of failing
            # with AttributeError on ``.find``.
            continue

        # The left-nav layout takes priority over the carousel layout.
        if category_response.find('div', {'class': LEFT_NAV_CLASS}):
            find_nav_hierarchy(hierarchy_name, category_response)
        elif category_response.find('ol', {'class': CAROUSAL_CLASS}):
            find_carousel_hierarchy(hierarchy_name, category_response)

    # NOTE(review): presumably blocks until the queued urls are processed
    # by workers started elsewhere -- confirm against the queue consumers.
    urls_queue.join()

示例#2

0

显示文件

def get_tree_hierarchy(hierarchy, url):
    """
    Walk the small-box tiles of a box-grid page and dispatch each category.

    Each tile's category page is fetched and passed to the handler for the
    first layout marker found on it. A page with none of the known markers
    is put on ``urls_queue`` as ``hierarchy|name|url``.

    :param hierarchy: Hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    grid = page.find('div', {'class': BOX_GRID_CONTAINER_CLASS})
    if not grid:
        return

    tiles = grid.findAll('div', {'class': SMALL_BOX_GRID_CLASS})
    if len(tiles) == 0:
        return

    for tile in tiles:
        tile_name = string_format(tile.img['alt'])
        tile_url = url_format(tile.find('a')['href'])
        tile_hierarchy = '{}|{}'.format(hierarchy, tile_name)

        tile_page = get_content(tile_url)
        if not tile_page:
            continue

        # Probe for every known layout marker first, then dispatch on the
        # first one present, in this priority order.
        indent_one_list = tile_page.find("ul", {'class': INDENT_ONE_CLASS})
        box_grid_tiles = tile_page.findAll(
            'div', {'class': SMALL_BOX_GRID_CLASS})
        left_nav = tile_page.find('div', {'class': LEFT_NAV_CLASS})
        carousel = tile_page.find('ol', {'class': CAROUSAL_CLASS})

        if indent_one_list:
            find_level_1_hierarchy(tile_hierarchy, indent_one_list)
        elif len(box_grid_tiles) != 0:
            find_box_grid_hierarchy(tile_hierarchy, tile_page)
        elif left_nav:
            find_left_nav_hierarchy(tile_hierarchy, left_nav)
        elif carousel:
            find_carousel_hierarchy(tile_hierarchy, tile_page)
        else:
            # No known layout marker: queue the page url itself.
            urls_queue.put('{}|{}'.format(tile_hierarchy, tile_url))

    urls_queue.join()

示例#3

0

显示文件

文件： sports_and_outdoor_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_tree_hierarchy(hierarchy, url):
    """
    Pass every category tile container on the page to ``collect_urls``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: none
    """
    page = get_content(url)
    if not page:
        return

    tile_classes = (
        'acs-ux-innerc1 acs-category-tile-links ',
        'acs-ux-innerc2 acs-category-tile-links ',
        'acs-ux-innerc3 acs-category-tile-links ',
    )

    # Run all three lookups before processing any tile.
    tile_groups = [
        page.find_all('div', {'class': tile_class})
        for tile_class in tile_classes
    ]

    for tile_group in tile_groups:
        for tile in tile_group:
            collect_urls(hierarchy, tile)

    # wait till all the urls complete in queue
    urls_queue.join()

示例#4

0

显示文件

文件： url_collector.py 项目： akhilreddyyeredla/Web_Crawlers

def get_product_urls(hierarchy_url):
    """
    Collect the product urls listed on one catalogue page.

    :param hierarchy_url: string in the format 'hierarchy|page_url'
    :return: None
    :working: requests the page, builds one 'hierarchy|product_url' line per
              product tile and passes the lines to update_files
    """
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[:-1])
    page_url = name_list[-1]

    # The first two hierarchy segments are left out of the directory path.
    hierarchy_path = '/'.join(name_list[2:-1])
    completed_path = '{}{}{}'.format(DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
                                     DataCollectors_Configuration.PATH_STYLE, hierarchy_path)

    response = get_content(page_url)
    if not response:
        return

    product_url_tags = response.find_all('div', {'class': 'catalogue-product row'})
    if not product_url_tags:
        return

    urls_list = []
    for product_url_tag in product_url_tags:
        anchor_tag = product_url_tag.find('a')
        # A tile without a link, or a link without an href, is skipped;
        # indexing it would abort the whole page with TypeError/KeyError.
        if anchor_tag is None:
            continue
        href = anchor_tag.get('href')
        if href is None:
            continue

        product_url = '{}{}'.format(MAIN_URL, href)
        urls_list.append('{}|{}'.format(hierarchy_name, product_url))

    update_files(completed_path, hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

示例#5

0

显示文件

def get_tree_hierarchy(hierarchy, url):
    """
    Queue every category linked from the page's left navigation block.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: puts one 'hierarchy|name|url' line per left-nav anchor on the
              queue, then calls create_workers()
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    nav_links = left_nav.findAll('a')
    if len(nav_links) == 0:
        return

    for nav_link in nav_links:
        link_name = string_format(nav_link)
        link_url = url_format(nav_link['href'])
        link_hierarchy = '{}|{}'.format(hierarchy, link_name)
        urls_queue.put('{}|{}'.format(link_hierarchy, link_url))

    # Workers are only created when at least one url was queued.
    create_workers()

示例#6

0

显示文件

def form_hierarchy(hierarchy, url):
    """
    Collect the first hierarchy level from a page's banner layouts and
    print how long the collection took.

    :param hierarchy: pipe-separated hierarchy name; its last segment is
                      used in the progress messages
    :param url: current page url
    :return: None
    """
    path = hierarchy.split('|')
    # Single-argument print(...) behaves the same on Python 2 and also
    # parses on Python 3, unlike the print statement.
    print('Started {} Hierarchy Collection'.format(path[-1]))
    start_time = time.time()

    page_container = response_getter.get_content(url)
    if page_container:
        banner_classes = (
            'banner-layout-5',
            'banner-layout-4',
            'banner-layout-8',
            'banner-layout-10',
        )
        # Look up every layout before processing any of them.
        banner_groups = [
            page_container.find_all('div', {'class': banner_class})
            for banner_class in banner_classes
        ]
        for content_containers in banner_groups:
            if content_containers:
                first_level_hierarchy(hierarchy, content_containers)

    end_time = time.time()
    total = end_time - start_time

    print('{} hierarchy collected |Started -> {} secs | Ended -> {} secs| Total -> {} secs '.format(
        path[-1], start_time, end_time, total))

示例#7

0

显示文件

文件： musique_dvd_et_blu_ray_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_nav_hierarchy(hierarchy, url):
    """
    Follow the "see more" link of every section in the left navigation.

    :param hierarchy:hierarchy name
    :param url: current url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    # Cut the nav markup at every <h3> and re-parse each piece on its own,
    # so that each piece holds a single section.
    for fragment in str(left_nav).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue

        main_category_name = string_format(heading)
        see_more = section.find("p", {"class": SEE_MORE_CLASS})
        if not see_more:
            continue

        category_url = url_format(see_more.a["href"])
        section_hierarchy = '{}|{}'.format(hierarchy, main_category_name)

        # Let the traverse-type lookup pick the handler for this url.
        find_traverse_type(section_hierarchy, category_url, False)

示例#8

0

显示文件

文件： electronics_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_traverse_type(hierarchy, url):
    """
    Fetch ``url`` and call the handler for the first layout marker found.

    Priority order: indent-two list, ACS left-nav widget, left nav,
    carousel. Nothing is called when none of them is present.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})
    acs_left_nav = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})

    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url)
        return

    if acs_left_nav:
        find_acs_nav_section(hierarchy, page)
        return

    if left_nav:
        find_nav_hierarchy(hierarchy, page)
        return

    if carousel:
        find_carousel_hierarchy(hierarchy, page)

示例#9

0

显示文件

文件： hogar_jardi_n_bricolaje_y_mascotas_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_hierarchy(hierarchy, url):
    """
    Recursively descend through the indent-two sub-category list.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recurses into every sub-category link; when the list has no
              such links, the page is passed to store_last_level_of_hierarchy
    """
    page = get_content(url)
    if not page:
        return

    sub_category_list = page.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_category_list:
        return

    sub_category_links = sub_category_list.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    if len(sub_category_links) == 0:
        # No further links: this is the last level of the hierarchy.
        store_last_level_of_hierarchy(hierarchy, page, url)
        return

    # Otherwise descend into each sub-category.
    for sub_category_link in sub_category_links:
        sub_name = string_format(sub_category_link)
        sub_url = url_format(sub_category_link['href'])
        find_hierarchy('{}|{}'.format(hierarchy, sub_name), sub_url)

示例#10

0

显示文件

def get_product_info(hierarchy_url):
    """
    Fetch one product page, store its details and record it as completed.

    :param hierarchy_url: string in the format 'hierarchy|page_url'
    :return: None
    :working: requests the page, extracts its details with get_details and,
              when data came back, stores it and appends hierarchy_url to
              the completed-info file
    """
    parts = hierarchy_url.split('|')
    page_url = parts[-1]
    hierarchy_name = '|'.join(parts[:-1])

    # The first two hierarchy segments are left out of the directory path.
    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        '/'.join(parts[2:-1]))

    page = get_content(page_url)
    if not page:
        return

    data, collected_date, collected_time = get_details(hierarchy_url, page)
    if not data:
        return

    store(MARKETPLACE, collected_date, hierarchy_name, collected_time, data)
    append_to_file('{}/{}'.format(completed_path, COMPLETED_INFO_FILE),
                   hierarchy_url)

示例#11

0

显示文件

文件： clothes_shoes_and_watches_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_box_grid_hierarchy(hierarchy, url):
    """
    Queue the category link of every small-box grid tile on the page.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: none
    :working: for each tile that has an anchor, builds 'hierarchy|name|url'
              from the image alt text and the href and adds it to the queue
    """
    page = get_content(url)
    if not page:
        return

    for tile in page.findAll('div', {'class': SMALL_BOX_GRID_CLASS}):
        tile_link = tile.find('a')
        if not tile_link:
            continue

        alt_text = tile_link.img['alt'].encode('utf-8')
        tile_name = string_format(str(alt_text))
        tile_url = url_format(tile_link['href'])
        tile_hierarchy = '{}|{}'.format(hierarchy, tile_name)

        # add links to the queue
        urls_queue.put('{}|{}'.format(tile_hierarchy, tile_url))

示例#12

0

显示文件

文件： sports_fitness_and_outdoors_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_tree_hierarchy(hierarchy, url):
    """
    Queue every sub-category found in the indent-one list of the page.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: puts one 'hierarchy|name|url' line per link on the queue and
              then joins the queue
    """
    page = get_content(url)
    if not page:
        return

    sub_category_list = page.find('ul', {'class': INDENT_ONE_CLASS})
    if not sub_category_list:
        return

    sub_category_links = sub_category_list.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    for sub_category_link in sub_category_links:
        sub_name = string_format(sub_category_link)
        sub_url = url_format(sub_category_link['href'])
        sub_hierarchy = '{}|{}'.format(hierarchy, sub_name)
        urls_queue.put('{}|{}'.format(sub_hierarchy, sub_url))

    # Joined even when no link was found, as long as the list exists.
    urls_queue.join()

示例#13

0

显示文件

def get_tree_hierarchy(main_category_name, url):
    """
    Start the category hierarchy collection for one main category.

    Queues every sub-category link found in the first ``a-list-item`` span
    of the indent-none list as 'main|sub|url', then joins the queue.

    :param main_category_name: Hierarchy name
    :param url: current_page_url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    indent_none_list = page.find("ul", {'class': INDENT_NONE_CLASS})
    if not indent_none_list:
        return

    list_item = indent_none_list.find("span", {'class': 'a-list-item'})
    if not list_item:
        return

    for sub_link in list_item.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS}):
        sub_name = string_format(sub_link)
        sub_hierarchy = '{}|{}'.format(main_category_name, sub_name)
        sub_url = url_format(sub_link['href'])
        urls_queue.put('{}|{}'.format(sub_hierarchy, sub_url))

    urls_queue.join()

示例#14

0

显示文件

def get_nav_hierarchy(hierarchy, url):
    """
    Queue the "see more" url of every section in the left navigation.

    :param hierarchy:hierarchy name
    :param url: current url
    :return: None
    :working : finds the 1st level of hierarchy and adds each
               'hierarchy|name|url' line to the queue, printing it as well
    """
    response_container = get_content(url)
    if not response_container:
        return

    response = response_container.find('div', {'class': 'a-section a-spacing-base'})
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # Cut the nav markup at every <h3> and re-parse each piece on its own,
    # so that each piece holds a single section.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue

        main_category_name = string_format(category_container)
        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        line = '{}|{}'.format(hierarchy_name, category_url)
        urls_queue.put(line)
        # Single-argument print(...) behaves the same on Python 2 and also
        # parses on Python 3, unlike the print statement.
        print(line)

示例#15

0

显示文件

文件： Amazon_main_category.py 项目： akhilreddyyeredla/Web_Crawlers

def collect_main_page_urls(main_url):
    """
    Map every 'category|sub_category' name on the home page to its url.

    :param main_url: Home page url
    :return: dictionary with category_names as key and urls as values;
             None (after printing 'got none') when get_content returned
             nothing usable
    """
    # get_content takes a url as input, parses it and returns the html
    # response back.
    raw_data = get_content(main_url)
    if not raw_data:
        print('got none')
        return None

    category_name_and_url = {}  # category names -> urls
    for group in raw_data.findAll('div', {'class': 'popover-grouping'}):
        heading = group.find('h2', {'class': 'popover-category-name'})
        category_name = string_format(heading)

        for link in group.findAll('a'):
            name_key = '{}|{}'.format(category_name, string_format(link))
            # The href does not contain the domain name, so prepend it.
            category_name_and_url[name_key] = '{}{}'.format(
                DOMAIN_NAME, link['href'])

    return category_name_and_url

示例#16

0

显示文件

文件： industria_empresas_y_ciencia_url_collector.py 项目： akhilreddyyeredla/Web_Crawlers

def get_product_urls(hierarchy_url):
    """
    Collect the product urls of a listing page, then traverse its pages.

    :param hierarchy_url: string in the format 'hierarchy|page_url'
    :return: None
    :working : skips pages already recorded as completed; otherwise requests
               the url, saves the product urls found, looks up the last page
               and calls traverse_pages
    """
    parts = hierarchy_url.split('|')
    hierarchy_name = '|'.join(parts[:-1])
    page_url = parts[-1]

    separator = DataCollectors_Configuration.PATH_STYLE
    completed_path = '{}{}{}{}{}{}{}'.format(
        DataCollectors_Configuration.ROOT_FOLDER, separator,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME, separator,
        hierarchy_name.replace('|', separator), separator,
        COMPLETED_PAGE)

    completed_set = file_to_set(completed_path)
    if in_completed_urls(page_url, completed_set):
        return

    page = get_content(page_url)
    if not page:
        return

    title_links = page.findAll('a', {
        'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
    if len(title_links) == 0:
        return

    urls_list = []
    for title_link in title_links:
        product_url = url_format(title_link['href'])
        urls_list.append('{}|{}'.format(hierarchy_name, product_url))

    update_files(hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    last_page = find_last_page(page)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)

示例#17

0

显示文件

def find_acs_nav_section(hierarchy, url):
    """
    Follow the sub-category links of the ACS left-nav widget.

    Sections whose header name is in ``IGNORE_LIST`` are skipped. A link is
    passed to ``find_traverse_type`` only when 'Tout' occurs in its
    'hierarchy|section|sub_category' name.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    nav_widget = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
    if not nav_widget:
        return

    for section in nav_widget.findAll('div', {'class': ACS_SECTION_CLASS}):
        section_header = section.find('button', {'class': 'acs-ln-header '})
        if not section_header:
            continue

        category_name = string_format(section_header)
        if category_name in IGNORE_LIST:
            continue

        for sub_link in section.findAll('a'):
            sub_category_name = string_format(sub_link)
            sub_category_url = url_format(sub_link['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, category_name, sub_category_name)

            if 'Tout' in hierarchy_name:
                find_traverse_type(hierarchy_name, sub_category_url, False)

示例#18

0

显示文件

def collect_all_data(hierarchy, url, last_page, completed_set):
    """
    Collect product urls from the numbered pages that follow the first one.

    :param hierarchy: hierarchy name
    :param url: current page url
    :param last_page: last page number
    :param completed_set: completed url sets to compare
    :return: None
    :working: for every page not yet completed, appends its product urls to
              a running list and passes that list to update_files
    """
    title_link_class = (
        'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'
    )

    # NOTE(review): this list is never reset, so each update_files call
    # receives the urls of all pages visited so far -- confirm intended.
    url_list = []

    # NOTE(review): range() stops before last_page, so the page numbered
    # last_page itself is not visited -- confirm against find_last_page.
    for pageNo in range(2, last_page):
        current_page = '{}&page={}'.format(url, pageNo)
        if in_completed_urls(current_page, completed_set):
            continue

        page = get_content(current_page)
        if not page:
            continue

        title_links = page.findAll('a', {'class': title_link_class})
        if len(title_links) == 0:
            continue

        for title_link in title_links:
            product_url = url_format(title_link['href'])
            url_list.append('{}|{}'.format(hierarchy, product_url))

        print('{}|{}'.format(hierarchy, current_page))
        update_files(hierarchy, url_list, current_page,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)

示例#19

0

显示文件

文件： sports_fitness_and_outdoors_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_hierarchy(hierarchy, url):
    """
    Recursively descend the indent-two list and record leaf categories.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recurses into every sub-category link; when the list has no
              such links, builds the 'hierarchy|url' line for the page and
              stores it with create_directory_and_hierarchy_files
    """
    response = get_content(url)
    if not response:
        return

    sub_categories_container = response.find('ul',
                                             {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    # Anchor tags present: more categories below, recurse into each of them.
    if anchor_tags:
        for anchor_tag in anchor_tags:
            category_name = string_format(anchor_tag)
            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            find_hierarchy(hierarchy_name, category_url)
        return

    # No anchor tags: this is the last level of the hierarchy.
    h4_tag = response.find('h4', {'class': H4_TAG_CLASS})
    if not h4_tag:
        return

    category_name = string_format(h4_tag)

    # Use the layout-picker link when the page has one, otherwise keep the
    # current url.
    category_url_tag = response.find('a', {'title': LAYOUT_PICKER})
    if category_url_tag:
        category_url = url_format(category_url_tag['href'])
    else:
        category_url = url

    # Do not append the name again when it already occurs in the hierarchy.
    # NOTE(review): this is a substring test on the whole hierarchy string,
    # not a per-segment comparison -- confirm that is intended.
    if category_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

    line = '{}|{}'.format(hierarchy_name, category_url)
    # Single-argument print(...) behaves the same on Python 2 and also
    # parses on Python 3, unlike the print statement.
    print(line)

    # store the line in a file and create hierarchy directory
    create_directory_and_hierarchy_files(hierarchy_name, line)

示例#20

0

显示文件

文件： elektronik_and_computer_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_nav_hierarchy(hierarchy, url):
    """
    Queue the first hierarchy level found in the left navigation.

    :param hierarchy:hierarchy name
    :param url: current page url
    :return: none
    :working: splits the left nav into sections; a section with a "see more"
              link is queued as 'hierarchy|section|url', otherwise each of
              its links is queued as 'hierarchy|section|sub_category|url'.
              Sections whose name is one of the ignored names are skipped.
    """
    # Exact section names to skip. The pipe-joined literal used to be tested
    # with ``in`` directly, which is a substring test and therefore also
    # skipped unrelated names such as 'and', 'Echo' or 'Smartphones'.
    ignored_names = frozenset(
        'Angebote_and_Aktionen|Kindle_Fire_and_Echo|Smartphones_and_mehr|Ratgeber_and_Services'
        .split('|'))

    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # Cut the nav markup at every <h3> and re-parse each piece on its own,
    # so that each piece holds a single section.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue

        main_category_name = string_format(category_container)

        # Checked once per section, since the name does not change between
        # links. An empty name stays skipped, as it was under the substring
        # test; it would otherwise yield a 'hierarchy||...' line.
        if not main_category_name or main_category_name in ignored_names:
            continue

        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if see_more_tag:
            category_url = url_format(see_more_tag.a["href"])
            hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
            urls_queue.put('{}|{}'.format(hierarchy_name, category_url))
        else:
            # No "see more" link: queue every link of the section itself.
            for anchor_tag in nav_html.findAll('a'):
                sub_category_name = string_format(anchor_tag)
                hierarchy_name = '{}|{}|{}'.format(
                    hierarchy, main_category_name, sub_category_name)
                sub_category_url = url_format(anchor_tag['href'])
                urls_queue.put('{}|{}'.format(hierarchy_name,
                                              sub_category_url))

示例#21

0

显示文件

文件： product_parser_1.py 项目： akhilreddyyeredla/Web_Crawlers

def check_and_get_seller_data(raw_data):
    """
    Resolve seller details from the seller element of a product page.

    When the element holds a link, the seller page is fetched and the
    result of ``get_seller_info`` is returned. In every other case a
    7-tuple is returned: the seller name (or 'not_available') followed by
    six 'not_available' placeholders.

    :param raw_data: parsed element expected to contain the seller name
    :return: the get_seller_info result, or the 7-tuple described above
    """
    missing = 'not_available'

    seller_link = raw_data.find('a')
    if not seller_link:
        # No link: the element text itself may still be the seller name.
        seller_name = text_format(raw_data)
        if seller_name:
            return (seller_name,) + (missing,) * 6
        return (missing,) * 7

    seller_name = text_format(seller_link)
    seller_url = url_format(seller_link['href'])
    seller_page = response_getter.get_content(seller_url)
    if seller_page:
        return get_seller_info(seller_name, seller_page)

    # Seller page could not be fetched: only the name is known.
    return (seller_name,) + (missing,) * 6

示例#22

0

显示文件

文件： auto_e_moto_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_acs_nav_section(hierarchy, url):
    """
    Queue the sub-category links of the page's ACS navigation sections.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :Working: fetches the page, walks every ACS_SECTION_CLASS block inside
              the 'a-section a-spacing-base' container, skips categories
              listed in IGNORE_LIST and puts a 'hierarchy|category|sub|url'
              line on urls_queue for every link whose hierarchy name
              contains 'Tutto'

    """
    page = get_content(url)
    if not page:
        return

    section_root = page.find('div', {'class': 'a-section a-spacing-base'})
    if not section_root:
        return

    for section in section_root.findAll('div', {'class': ACS_SECTION_CLASS}):
        heading = section.find('div', {'class': 'acs-ln-links'})
        if not heading:
            continue

        category_name = string_format(heading)
        if category_name in IGNORE_LIST:
            continue

        for link in section.findAll('a'):
            sub_name = string_format(link)
            sub_url = url_format(link['href'])
            full_name = '{}|{}|{}'.format(hierarchy, category_name, sub_name)
            entry = '{}|{}'.format(full_name, sub_url)

            # Only hierarchies containing 'Tutto' are queued; every other
            # link is dropped.
            if 'Tutto' in full_name:
                urls_queue.put(entry)

示例#23

0

显示文件

文件： amazon_pantry_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def start_program():
    """
    Start the crawl from the hard-coded Amazon Pantry seed links.

    Every seed is a pipe-delimited string whose last field is the url and
    whose preceding fields form the hierarchy name. For each seed,
    ``create_workers()`` is called, the url is fetched, and a truthy page is
    passed on to ``get_indent_two_hierarchy``.

    :return: None
    """
    seeds = [
        'Amazon_Pantry|Baby_and_Child_Care|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bc/262-6468249-9592357?ie=UTF8&node=8479375031',
        'Amazon_Pantry|Beer_Wine_and_Spirits|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bws/262-6468249-9592357?ie=UTF8&node=8464529031',
        'Amazon_Pantry|Beverages|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_bv/262-6468249-9592357?ie=UTF8&node=5782664031',
        'Amazon_Pantry|Food_Cupboard|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_fc/262-6468249-9592357?ie=UTF8&node=5782663031',
        'Amazon_Pantry|Health_and_Beauty|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_hb/262-6468249-9592357?ie=UTF8&node=5790355031',
        'Amazon_Pantry|Household_Supplies|https://www.amazon.co.uk/b/ref=sd_allcat_prime_pantry_hs/262-6468249-9592357?ie=UTF8&node=5790354031',
        'Amazon_Pantry|Past_Purchases|https://www.amazon.co.uk/gp/pantry/past-purchases/ref=sd_allcat_prime_pantry_pp/262-6468249-9592357'
    ]

    for seed in seeds:
        # Everything before the last '|' is the hierarchy name, the rest is
        # the url.
        name, _, url = seed.rpartition('|')
        create_workers()
        page = get_content(url)
        if page:
            get_indent_two_hierarchy(name, page, url)

示例#24

0

显示文件

文件： auto_motorrad_and_gewerbe_info_collector.py 项目： akhilreddyyeredla/Web_Crawlers

def get_correct_data(hierarchy, url):
    """
    Fetch ``url`` and extract its data, retrying up to CONSTANTS.MAX_RETRIES.

    :param hierarchy: category hierarchy
    :param url: current page url
    :return: the first truthy result of ``get_data``; if no attempt yields
             one, the last value ``get_data`` returned, or None when it was
             never called
    """
    result = None
    for _ in range(CONSTANTS.MAX_RETRIES):
        # get_content presumably returns a BeautifulSoup object (per the
        # original note); a falsy value means the fetch failed, so retry.
        soup = response_getter.get_content(url)
        if not soup:
            continue

        result = get_data(soup, hierarchy, url)
        if result:
            return result
    return result

示例#25

0

显示文件

文件： electronics_and_computers_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def find_traverse_type(hierarchy, url, flag):
    """
    Fetch ``url`` and dispatch to the traversal routine matching its layout.

    :param hierarchy: category hierarchy
    :param url: current page url
    :param flag: forwarded unchanged to ``get_indent_two_hierarchy``; per the
                 original note, true means this is the first call and false
                 the second
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})

    # The indent-two layout wins when both layouts are present.
    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url, flag=flag)
        return

    if left_nav:
        find_nav_hierarchy(hierarchy, page)

示例#26

0

显示文件

文件： electronics_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_tree_hierarchy(hierarchy, url):
    """
    Walk the left-navigation block of ``url`` and process each of its groups.

    The navigation markup is split on '<h3>' and every piece is parsed on
    its own. The formatted text of the piece's first <p> is its category
    name: names in IGNORE_LIST are skipped, names in SELECTED_LIST go to
    ``get_level_1_see_more_hierarchy`` and all others to
    ``get_level_1_hierarchy``. Once all pieces are handled the function
    blocks on ``urls_queue.join()``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for fragment in str(left_nav).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        heading = section.find('p') if section else None
        if not heading:
            continue

        main_category_name = string_format(heading)
        if main_category_name in IGNORE_LIST:
            continue

        if main_category_name in SELECTED_LIST:
            handler = get_level_1_see_more_hierarchy
        else:
            handler = get_level_1_hierarchy
        handler(hierarchy, section)

    urls_queue.join()

示例#27

0

显示文件

文件： sports_and_outdoors.py 项目： akhilreddyyeredla/Web_Crawlers

def find_traverse_type(hierarchy, url):
    """
    Fetch ``url`` and dispatch to the traversal routine matching its layout.

    The page is probed for three containers; the first match in the order
    indent-two list, left navigation, carousel decides which routine runs.

    :param hierarchy: category hierarchy
    :param url: current page url
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})

    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url)
        return

    if left_nav:
        get_level_1_see_more_hierarchy(hierarchy, page)
        return

    if carousel:
        find_carousel_hierarchy(hierarchy, page)

示例#28

0

显示文件

def find_nav_hierarchy(hierarchy, url):
    """
    Walk the left-navigation block of ``url`` and descend into its links.

    The navigation markup is split on '<h3>' and every piece is parsed on
    its own. The formatted text of the piece's first <p> is the main
    category name; each <a> in the piece is a sub-category candidate.
    Only hierarchies containing 'Jouets_par_cate_gorie' are printed and
    passed on to ``find_traverse_type``; all other links are dropped.

    :param hierarchy: hierarchy name
    :param url: current url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # For every '<h3>'-delimited piece, find the main category name and the
    # anchor tags that belong to it.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue
        main_category_name = string_format(category_container)

        for anchor_tag in nav_html.findAll('a'):
            sub_category_name = string_format(anchor_tag)
            # The ignore check is done on the sub-category name with the
            # main category name removed from it.
            if sub_category_name.replace(main_category_name,
                                         '') in IGNORE_LIST:
                continue

            sub_category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}|{}'.format(
                hierarchy, main_category_name, sub_category_name)

            if 'Jouets_par_cate_gorie' in hierarchy_name:
                # Call form of print: identical output on Python 2 for a
                # single argument and valid syntax on Python 3.
                print(hierarchy_name)
                find_traverse_type(hierarchy_name, sub_category_url, False)

示例#29

0

显示文件

文件： clothes_shoes_and_watches_hierarchy.py 项目： akhilreddyyeredla/Web_Crawlers

def get_tree_hierarchy(hierarchy, url):
    """
    Queue the first-level sub-category links of the left navigation.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :working: splits the LEFT_NAV_CLASS container on '<h3>', takes the first
              <p> of every piece as the main category name and puts a
              'hierarchy|main|sub|url' line on urls_queue for each <a> in
              the piece, except when the main category name contains
              'New_Arrivals'
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    if not left_nav:
        return

    for fragment in str(left_nav).split('<h3>'):
        section = BeautifulSoup(fragment, 'lxml')
        if not section:
            continue

        heading = section.find('p')
        if not heading:
            continue

        main_category_name = string_format(heading)
        # The skip decision depends only on the group name, so take it once.
        skip_group = 'New_Arrivals' in main_category_name

        for link in section.findAll('a'):
            sub_category_name = string_format(link)
            if skip_group:
                continue

            full_name = '{}|{}|{}'.format(
                hierarchy, main_category_name, sub_category_name)
            target_url = url_format(link['href'])
            urls_queue.put('{}|{}'.format(full_name, target_url))

示例#30

0

显示文件

def find_hierarchy(hierarchy_name, url):
    """
    Recursively walk the category tree starting at ``url``.

    When the sub-category container holds an <h4> tag of class H4_TAG_CLASS
    the page is treated as the last level: its link is printed and handed to
    ``create_directory_and_hierarchy_files``. Otherwise every anchor of class
    NORMAL_ANCHOR_TAG_CLASS is followed by calling this function again, and
    its url is appended to ``sub_category_urls``.

    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if response:
        sub_category_container = find_sub_category_container(response)

        if sub_category_container:
            h4_tag = sub_category_container.find('h4', {'class': H4_TAG_CLASS})

            # If it contains an <h4> tag then it is the last level of the hierarchy
            if h4_tag:
                # NOTE(review): find('a') is not checked for a missing anchor
                # before it is indexed below -- confirm the <h4> always holds one.
                anchor_tag = h4_tag.find('a')

                category_url = url_format(anchor_tag['href'])
                line = '{}|{}'.format(hierarchy_name, category_url)
                print(line)

                # store the line in a file and create the hierarchy directory
                create_directory_and_hierarchy_files(hierarchy_name, line)

            # otherwise it contains more categories: find their urls and call this function again
            else:
                anchor_tags = sub_category_container.findAll('a', {'class': NORMAL_ANCHOR_TAG_CLASS})
                for anchor_tag in anchor_tags:
                    category_name = string_format(anchor_tag)
                    category_url = url_format(anchor_tag['href'])
                    hierarchy = '{}|{}'.format(hierarchy_name, category_name)

                    # recursively call this function for the sub-category
                    find_hierarchy(hierarchy, category_url)

                    # record the visited url (same value as category_url, formatted a second time)
                    sub_category_urls.append(url_format(anchor_tag['href']))