Example #1
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# logger, bs_preprocess, url_name_of, WikiList, LINK_FILTERS and crawl_page
# are provided by the surrounding project and are not shown in this excerpt.


def process_list(url, title, raw_content, depth):
    """Parse a list page and enqueue a crawl for every link that passes LINK_FILTERS."""
    if depth <= 0:
        logger.info('Reached depth limit: {0}'.format(title))
        return

    logger.info('Processing list {0}'.format(title))

    content = BeautifulSoup(
        bs_preprocess(raw_content), 'html.parser')

    url_name = url_name_of(url)
    WikiList.objects.get_or_create(
        url_name=url_name,
        defaults={'title': title})

    # remove noisy links: each entry in LINK_FILTERS is a predicate over <a> tags,
    # and the lazy filter chain is consumed once by the crawl loop below
    link_list = content.find_all('a')
    for f in LINK_FILTERS:
        link_list = filter(f, link_list)

    # crawl all links of the list
    for link in link_list:
        abs_url = urljoin(url, link['href'])
        crawl_page.delay(abs_url, url_name, depth - 1)
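
Example #1 leans on two names that are not shown on this page: LINK_FILTERS and crawl_page. Judging by how they are used, LINK_FILTERS is a sequence of predicates over <a> tags and crawl_page is a Celery task (hence the .delay() call). A minimal sketch under those assumptions, not the project's actual definitions:

# Illustrative sketch only: the real LINK_FILTERS and crawl_page are defined
# elsewhere in the project; both definitions below are assumptions.
from celery import shared_task

LINK_FILTERS = [
    lambda a: a.has_attr('href'),                # keep only links with a target
    lambda a: not a['href'].startswith('#'),     # drop in-page anchors
    lambda a: not a['href'].startswith('http'),  # drop external links
]


@shared_task
def crawl_page(url, parent_list, depth):
    # Fetch `url` and route the response to process_list or process_article;
    # the fetching and routing logic is omitted here.
    ...

Because crawl_page.delay() only enqueues work, process_list fans out one task per surviving link and relies on the decreasing depth argument to bound the recursion.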
Example #2
from bs4 import BeautifulSoup

# logger, bs_preprocess, url_name_of, url_name_of_cat, WikiPage, WikiList and
# WikiCategory are provided by the surrounding project and are not shown here.


def process_article(url, title, raw_content, parent=None):
    """Store an article's cleaned body and attach it to its parent list and categories."""
    logger.info('Processing article {0}'.format(title))

    content = BeautifulSoup(
        bs_preprocess(raw_content), 'html.parser')

    # build the stored body from the raw HTML, stripping MediaWiki chrome:
    # per-section edit links and the jump-to-nav anchor (guarded, since the
    # anchor is not present on every page)
    body = BeautifulSoup(raw_content, 'html.parser')
    for edits in body.find_all(class_='mw-editsection'):
        edits.extract()
    jump_nav = body.find(id='jump-to-nav')
    if jump_nav is not None:
        jump_nav.extract()
    body = BeautifulSoup(str(body), 'html.parser')

    page, _ = WikiPage.objects.get_or_create(
        url_name=url_name_of(url),
        defaults={
            'title': title,
            'origin': url})
    page.body = str(body)
    page.save()

    # attach the article to the parent list it was crawled from
    if parent:
        wl, _ = WikiList.objects.get_or_create(
            url_name=parent)
        page.lists.add(wl)

    # add categories (skip pages whose parsed content has no mw-normal-catlinks block)
    catlinks = content.find(id='mw-normal-catlinks')
    category_items = catlinks.find_all('li') if catlinks else []
    for cat in category_items:
        url_name = url_name_of_cat(cat.find('a')['href'])
        if not url_name:
            continue
        wc, _ = WikiCategory.objects.get_or_create(
            url_name=url_name,
            defaults={'title': str(cat.text)})
        page.categories.add(wc)
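
Both examples also call url_name_of and url_name_of_cat without defining them. From the way the results are used (as unique slugs, and as a value that may be falsy for non-category links), they most likely reduce a URL to its last path segment; the helpers below are an assumption about their shape, not the project's code:

# Assumed helpers, inferred from how the examples use their return values.
from urllib.parse import unquote, urlparse


def url_name_of(url):
    # Last path segment of a wiki URL, e.g. '/wiki/Foo' -> 'Foo'.
    return unquote(urlparse(url).path.rstrip('/').split('/')[-1])


def url_name_of_cat(href):
    # Category slug from a category link, or None for non-category links.
    name = url_name_of(href)
    prefix = 'Category:'
    return name[len(prefix):] if name.startswith(prefix) else None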