Пример #1
0
def collect_all_data(hierarchy, url, last_page, completed_set):
    """
    Collect product urls from the numbered follow-up pages of a listing.

    For every page number in ``range(2, last_page)`` the page url is built
    as ``'<url>&page=<n>'``. Pages reported as completed by
    ``in_completed_urls`` are skipped. For the others the page is passed to
    ``get_content`` and, when a truthy response with matching anchor tags
    comes back, one ``'<hierarchy>|<product_url>'`` line is built per tag
    and handed to ``update_files`` together with the page url.

    :param hierarchy: hierarchy name
    :param url: listing url that ``&page=<n>`` is appended to
    :param last_page: upper bound of the page range (excluded by ``range``)
    :param completed_set: completed url sets to compare
    :return: None
    :working: collects products url from all pages
    """
    # NOTE(review): range() excludes last_page itself. If the caller passes
    # the number of the final page, that page is never visited -- confirm
    # what the caller supplies before widening the range.
    for page_no in range(2, last_page):
        current_page = '{}&page={}'.format(url, page_no)
        if in_completed_urls(current_page, completed_set):
            continue

        response = get_content(current_page)
        if not response:
            continue

        product_url_tags = response.findAll(
            'a', {
                'class':
                'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'
            })
        if not product_url_tags:
            continue

        # Build a fresh list for each page. A single list shared across
        # iterations would hand update_files the urls of every previously
        # processed page again on each call.
        url_list = [
            '{}|{}'.format(hierarchy, url_format(product_url_tag['href']))
            for product_url_tag in product_url_tags
        ]

        print('{}|{}'.format(hierarchy, current_page))
        update_files(hierarchy, url_list, current_page,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def get_product_urls(hierarchy_url):
    """
    Process the first listing page of a hierarchy and hand off the rest.

    The text after the last ``'|'`` in ``hierarchy_url`` is taken as the
    page url; everything before it is the hierarchy name. Nothing is done
    when the page is already in the completed set, when ``get_content``
    yields a falsy response, or when no matching anchor tags are found.
    Otherwise one ``'<hierarchy>|<product_url>'`` line per tag is passed to
    ``update_files``, and ``traverse_pages`` is called with the value
    returned by ``find_last_page``.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    """
    # Split on the last '|' only: the hierarchy name may itself contain '|'.
    hierarchy_name, _, page_url = hierarchy_url.rpartition('|')

    config = DataCollectors_Configuration
    separator = config.PATH_STYLE
    path_parts = (
        config.ROOT_FOLDER,
        config.AMAZON_CANADA_PROJECT_NAME,
        hierarchy_name.replace('|', separator),
        COMPLETED_PAGE,
    )
    completed_path = separator.join('{}'.format(part) for part in path_parts)

    completed_set = file_to_set(completed_path)
    # NOTE(review): when the first page is already completed, the remaining
    # pages are not traversed either -- confirm this is the intended resume
    # behaviour.
    if in_completed_urls(page_url, completed_set):
        return

    response = get_content(page_url)
    if not response:
        return

    product_url_tags = response.findAll('a', {
        'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
    if not product_url_tags:
        return

    urls_list = [
        '{}|{}'.format(hierarchy_name, url_format(tag['href']))
        for tag in product_url_tags
    ]
    update_files(hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    last_page = find_last_page(response)
    # NOTE(review): traverse_pages is not defined in the visible part of this
    # file -- verify it exists (it may be an older name of a sibling helper).
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)