def get_updated_links(queue_files, completed_files):
    """
    :param queue_files: list of paths to queue files
    :param completed_files: list of paths to completed files
    :return: set of urls that are queued but not yet completed
    """
    queue_set = set()
    completed_set = set()

    for queue_file in queue_files:
        count = 1
        for url in file_to_set(queue_file):
            if DataCollectors_Configuration.PRODUCT_INFO_FLAG == CONSTANTS.PRODUCT_FlAG:
                # load only the sample number of urls configured in DataCollectors_Configuration
                if count <= DataCollectors_Configuration.NO_OF_PRODUCT_INFO_TO_COLLECT:
                    queue_set.add(url)
                    count += 1
                else:
                    break
            else:
                queue_set.add(url)

    for completed_file in completed_files:
        count = 1
        for url in file_to_set(completed_file):
            if DataCollectors_Configuration.PRODUCT_INFO_FLAG == CONSTANTS.PRODUCT_FlAG:
                # load only the sample number of urls configured in DataCollectors_Configuration
                if count <= DataCollectors_Configuration.NO_OF_PRODUCT_INFO_TO_COLLECT:
                    completed_set.add(url)
                    count += 1
                else:
                    break
            else:
                completed_set.add(url)

    # urls still pending = queued urls minus already completed urls
    final_set = queue_set - completed_set
    return final_set
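

# Illustrative sketch (not part of the original flow): get_updated_links() boils
# down to the set difference "queued urls minus completed urls". The helper below
# demonstrates that idea with plain in-memory sets, independent of the
# queue/completed files read through file_to_set().
def _example_pending_urls(queued_urls, completed_urls):
    """
    >>> sorted(_example_pending_urls({'url_a', 'url_b', 'url_c'}, {'url_b'}))
    ['url_a', 'url_c']
    """
    return set(queued_urls) - set(completed_urls)
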
def get_product_urls(hierarchy_url):
    """

    :param hierarchy_url: string in this format 'hierarchy|page_url"
    :return: None
    :working : requsts the url and then find last page and call's traverse_page function
    """
    # everything before the last '|' is the hierarchy path, the last segment is the page url
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[0:-1])
    page_url = name_list[-1]

    urls_list = []
    completed_path = DataCollectors_Configuration.PATH_STYLE.join([
        DataCollectors_Configuration.ROOT_FOLDER,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME,
        hierarchy_name.replace('|', DataCollectors_Configuration.PATH_STYLE),
        COMPLETED_PAGE])
    completed_set = file_to_set(completed_path)
    if in_completed_urls(page_url, completed_set):
        # this listing page was already processed in an earlier run
        return

    response = get_content(page_url)
    if response:
        # anchor tags that wrap each product title on the listing page
        product_url_tags = response.findAll('a', {
            'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
        if product_url_tags:
            for product_url_tag in product_url_tags:
                product_url = url_format(product_url_tag['href'])
                line = '{}|{}'.format(hierarchy_name, product_url)
                urls_list.append(line)

            update_files(hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

            last_page = find_last_page(response)
            traverse_pages(hierarchy_name, page_url, last_page, completed_set)
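

# Example call (hypothetical category hierarchy and listing url, shown only to
# illustrate the expected 'hierarchy|page_url' input format). The function below
# is defined for illustration and is not invoked anywhere in this module; calling
# it would issue a real HTTP request via get_content().
def _example_get_product_urls_call():
    get_product_urls('Electronics|Laptops|https://www.amazon.ca/s?k=laptops')
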
def start_url_collection(folder_name):
    """

    :param folder_name: name of the category folder under the project directory
    :return: None
    """
    path = DataCollectors_Configuration.PATH_STYLE.join([
        DataCollectors_Configuration.ROOT_FOLDER,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME,
        folder_name])
    all_files = get_all_files(path, DataCollectors_Configuration.PATTERN_1)
    for file_path in all_files:
        if os.path.exists(file_path):
            for url in file_to_set(file_path):
                urls_queue.put(url)
    # block until every queued url has been marked done by the worker threads
    urls_queue.join()
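

# Illustrative sketch (assumption, not the project's actual worker setup):
# urls_queue.join() above only returns once consumer threads have called
# task_done() for every queued url. A minimal wiring of such workers, with
# get_product_urls() as the consumer and an arbitrary thread count of 4, could
# look like the function below; it is not invoked anywhere in this module.
def _example_start_workers():
    import threading

    def _worker():
        while True:
            hierarchy_url = urls_queue.get()      # blocks until a url is available
            try:
                get_product_urls(hierarchy_url)   # crawl one listing page
            finally:
                urls_queue.task_done()            # lets urls_queue.join() return

    for _ in range(4):
        threading.Thread(target=_worker, daemon=True).start()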