Example #1
def crawl_page(url, parent=None, depth=0):
    # check if already crawled
    if BotoFinished.objects.filter(url=url).exists():
        if parent:
            try:
                page = WikiPage.objects.get(
                    url_name=url_name_of(url))
                wl, exist = WikiList.objects.get_or_create(
                    url_name=parent)
                page.lists.add(wl)
                page.save()
            except WikiPage.DoesNotExist:
                # the URL was crawled but never stored as an article
                logger.warning(
                    'list {0} not added to {1}'.format(parent, url))
        return
    else:
        BotoFinished(url=url).save()

    if can_access(url):
        logger.info('Starting to crawl {0}'.format(url))
        req = request.Request(
            url, data=None,
            headers={
                'User-Agent': settings.USER_AGENT,
            })
        f = request.urlopen(req)
        process_page(url, f.read(), parent, depth)

    else:
        logger.warning('Not allowed to access {0}'.format(url))
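This and the following examples appear to come from a single Wikipedia crawler module that combines Celery tasks, Django models, and BeautifulSoup. Below is a minimal sketch of the module-level setup they seem to rely on; the import paths for the project's own helpers are assumptions, and the @shared_task decorator on crawl_page is only inferred from the crawl_page.delay call in Example #2.

# A sketch of the module-level imports the examples appear to assume;
# paths for the project's own modules are guesses.
import logging
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from celery import shared_task
from django.conf import settings

# project-local names used by the examples (module paths assumed):
# from .models import BotoFinished, WikiPage, WikiList, WikiCategory
# from .utils import (bs_preprocess, can_access, url_name_of,
#                     url_name_of_cat, LINK_FILTERS)

logger = logging.getLogger(__name__)

A crawl would then presumably be started by enqueuing the root page, e.g. crawl_page.delay(start_url, None, depth).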
Example #2
def process_list(url, title, raw_content, depth):
    if depth <= 0:
        logger.info('Reached depth limit: {0}'.format(title))
        return

    logger.info('Processing list {0}'.format(title))

    content = BeautifulSoup(
        bs_preprocess(raw_content), 'html.parser')

    url_name = url_name_of(url)
    WikiList.objects.get_or_create(
        url_name=url_name,
        defaults={'title': title})

    # remove noisy links
    link_list = content.find_all('a')
    for f in LINK_FILTERS:
        link_list = filter(f, link_list)

    # crawl all links of the list
    for link in link_list:
        abs_url = urljoin(url, link['href'])
        crawl_page.delay(abs_url, url_name, depth - 1)
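LINK_FILTERS is not shown in this example; from the way it is used, each entry must be a predicate that takes a BeautifulSoup <a> tag and returns a boolean, and the filters are applied in order. The rules below are purely hypothetical placeholders, not the project's actual filters.

# Hypothetical link filters: each entry is a predicate over an <a> tag.
# Order matters: later filters may rely on earlier ones (e.g. href exists).
LINK_FILTERS = [
    lambda a: a.has_attr('href'),              # drop anchors without an href
    lambda a: not a['href'].startswith('#'),   # drop in-page anchors
    lambda a: ':' not in a['href'],            # drop File:, Special:, external links
]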
Example #3
def process_article(url, title, raw_content, parent=None):
    logger.info('Processing article {0}'.format(title))

    content = BeautifulSoup(
        bs_preprocess(raw_content), 'html.parser')

    body = BeautifulSoup(raw_content, 'html.parser')
    for edits in body.find_all(class_='mw-editsection'):
        edits.extract()
    nav = body.find(id='jump-to-nav')
    if nav:  # the navigation anchor may be absent on some pages
        nav.extract()
    body = BeautifulSoup(str(body), 'html.parser')

    page, exist = WikiPage.objects.get_or_create(
        url_name=url_name_of(url),
        defaults={
            'title': title,
            'origin': url})
    page.body = str(body)
    page.save()

    # add lists
    if parent:
        wl, exist = WikiList.objects.get_or_create(
            url_name=parent)
        page.lists.add(wl)

    # add categories (the catlinks block may be absent on some pages)
    catlinks = content.find(id='mw-normal-catlinks')
    if catlinks:
        for cat in catlinks.find_all('li'):
            url_name = url_name_of_cat(cat.find('a')['href'])
            if not url_name:
                continue
            wc, exist = WikiCategory.objects.get_or_create(
                url_name=url_name,
                defaults={'title': str(cat.text)})
            page.categories.add(wc)
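The model classes themselves are not part of these examples. The sketch below only reflects how the fields are used above (BotoFinished as a crawled-URL marker, WikiPage with many-to-many lists and categories); field types, lengths, and constraints are guesses, not the project's actual definitions.

from django.db import models

class BotoFinished(models.Model):
    # marks a URL as already crawled so crawl_page can skip it
    url = models.URLField(unique=True)

class WikiList(models.Model):
    url_name = models.CharField(max_length=255, unique=True)
    title = models.CharField(max_length=255, blank=True)

class WikiCategory(models.Model):
    url_name = models.CharField(max_length=255, unique=True)
    title = models.CharField(max_length=255, blank=True)

class WikiPage(models.Model):
    url_name = models.CharField(max_length=255, unique=True)
    title = models.CharField(max_length=255)
    origin = models.URLField()
    body = models.TextField(blank=True)
    lists = models.ManyToManyField(WikiList, related_name='pages')
    categories = models.ManyToManyField(WikiCategory, related_name='pages')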