Пример #1
0
# Maps a leaf level to the levels whose most recent entries make up its
# category path, ordered from the top-level category down to the leaf.
# Level 1 is handled separately in form_hierarchy because it has no URL lookup.
_HIERARCHY_LEVEL_PATHS = {
    4: (1, 4),
    6: (1, 4, 6),
    7: (1, 4, 7),
    8: (1, 4, 6, 8),
}


def _sanitize_category_name(name):
    """
    Turn a raw '|'-joined category name into the key/path form used here.

    :param name: raw category name; parts are separated by '|'
    :return: the name with '&' spelled out as 'and' and every character
             other than an ASCII letter or '|' replaced by '_'
    """
    cleaned = name.replace('&', 'and').strip()
    # NOTE(review): replace('__', '') deletes pairs of underscores outright
    # instead of collapsing them to one. Kept unchanged because the result is
    # used as a directory path and as a hierarchy_dict key - confirm intent.
    return re.sub('[^a-zA-Z|]', '_', cleaned).replace('__', '').strip()


def form_hierarchy(level):
    """
    Register the category most recently recorded at ``level`` and create its
    directory under the project folder.

    :param level: level number of the entry just added to ``level_Dictionary``.
                  Levels 1, 4, 6, 7 and 8 are handled; any other value is
                  ignored.
    :working: For level 1 only a directory named after the sanitised category
              is created. For the other levels the path is assembled from the
              latest entries of the ancestor levels; if the leaf name is a key
              of ``url_dict`` the sanitised path is stored in
              ``hierarchy_dict`` with the leaf's URL and the matching nested
              directory is created. Leaves without a URL are skipped.
    :return: does not return anything
    """
    if level == 1:
        # Top-level categories get a directory but no hierarchy_dict entry.
        category_name = _sanitize_category_name(level_Dictionary[1][-1])
        create_project_dir(project_name + '/' + category_name)
        return

    path_levels = _HIERARCHY_LEVEL_PATHS.get(level)
    if path_levels is None:
        return

    # The last element of each level's list is the entry currently in scope.
    parts = [level_Dictionary[path_level][-1] for path_level in path_levels]
    leaf = parts[-1]
    if leaf not in url_dict:
        return

    category_name = _sanitize_category_name('|'.join(parts))
    hierarchy_dict[category_name] = url_dict[leaf]
    # '|' separates hierarchy levels in the key; on disk each level is a folder.
    create_project_dir(project_name + '/' + category_name.replace('|', '/'))
Пример #2
0
def create_hirerachy(starting_url):
    """
    Collect the category hierarchy and queue its URLs.

    :param starting_url: starting URL, passed on to ``collect_main_page_urls``
    :working: Creates the project directory, then writes one
              ``<hierarchy key>|<url>`` line per collected entry to
              ``queue_links`` through ``list_to_file``. Per-category
              directories are not created by this function.
    :return: does not return anything
    """
    create_project_dir(PROJECT_NAME)
    # Presumably maps a '|'-separated category path to its URL - verify
    # against collect_main_page_urls.
    category_hierarchy_and_urls = collect_main_page_urls(starting_url)

    # One queue line per entry, in the mapping's own iteration order.
    urls_list = [
        '{}|{}'.format(key, url)
        for key, url in category_hierarchy_and_urls.items()
    ]

    list_to_file('queue_links', urls_list)
Пример #3
0
def collect_hirerachy_details():
    """
    Build the category hierarchy from the main page and write it to file.

    :working: Creates the project directory, fetches the page at
              ``DataCollectors_Configuration.SOUQ_MAIN_URL``, cuts every
              ``large-4 columns`` div into fragments at each
              ``shop-all-title`` heading, runs ``form_url_dict`` on the page,
              walks every fragment with ``dfs`` and finally hands
              ``hierarchy_dict`` to ``write_hirerachy_file``.
    :return: does not return anything
    """
    create_project_dir(project_name)
    page_soup, _received_url = get_page_soup(
        DataCollectors_Configuration.SOUQ_MAIN_URL)

    # Split each column's markup at the category headings. Every piece,
    # including the text before the first heading, is given a plain <h3>
    # prefix so that it can be parsed as its own block.
    heading_marker = '<h3 class="shop-all-title">'
    category_fragments = [
        '<h3>' + fragment
        for column in page_soup.find_all("div", {'class': 'large-4 columns'})
        for fragment in str(column).split(heading_marker)
    ]

    # Per the original note: builds the lookup of last-level sub-category
    # name to its URL.
    form_url_dict(page_soup)

    # Parse each fragment and traverse it from level 0; dfs receives the
    # shared level_Dictionary on every call.
    for fragment in category_fragments:
        dfs(BeautifulSoup(fragment, "html.parser"), 0, level_Dictionary)

    # Persist the collected hierarchy.
    write_hirerachy_file(hierarchy_dict)