import logging
from urllib import request

from django.conf import settings

# Project-local models and helpers (module paths assumed).
from .models import BotoFinished, WikiList, WikiPage
from .utils import can_access, process_page, url_name_of

logger = logging.getLogger(__name__)


def crawl_page(url, parent=None, depth=0):
    # check if already crawled
    if BotoFinished.objects.filter(url=url).exists():
        # Already visited: at most, record that this page belongs to the
        # parent list that linked to it.
        if parent:
            try:
                page = WikiPage.objects.get(url_name=url_name_of(url))
                wl, created = WikiList.objects.get_or_create(url_name=parent)
                page.lists.add(wl)
                page.save()
            except WikiPage.DoesNotExist:
                logger.warning('list not added')
        return
    else:
        # Mark the URL as visited before fetching so it is not crawled twice.
        BotoFinished(url=url).save()
        if can_access(url):
            logger.info('Starting to crawl {0}'.format(url))
            req = request.Request(
                url,
                data=None,
                headers={'User-Agent': settings.USER_AGENT},
            )
            f = request.urlopen(req)
            process_page(url, f.read(), parent, depth)
        else:
            logger.warning('Not allowed to access {0}'.format(url))
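# can_access() and url_name_of() are referenced above but not defined in this
# section. A minimal sketch of what they might look like, assuming can_access()
# is a robots.txt compliance check and url_name_of() extracts the trailing path
# segment of a Wikipedia URL; the project's actual implementations may differ.
from urllib import robotparser
from urllib.parse import unquote, urlparse


def can_access(url):
    # Honor the target host's robots.txt before fetching anything.
    parts = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url('{0}://{1}/robots.txt'.format(parts.scheme, parts.netloc))
    rp.read()
    return rp.can_fetch(settings.USER_AGENT, url)


def url_name_of(url):
    # 'https://en.wikipedia.org/wiki/Foo_bar' -> 'Foo_bar'
    return unquote(urlparse(url).path.rsplit('/', 1)[-1])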
from urllib.parse import quote
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.http import HttpResponseRedirect
from django.shortcuts import render

# Project-local model and helper (module paths assumed).
from .models import WikiPage
from .utils import can_access


def show(request, url_name):
    try:
        # Serve the page from the local database if it has been crawled.
        page = WikiPage.objects.get(url_name=url_name)
        return render(
            request,
            "wikipage/show.html",
            {
                "title": page.title,
                "body": page.body,
                "lists": page.lists,
                "categories": page.categories,
            },
        )
    except ObjectDoesNotExist:
        # Otherwise fetch the article live from Wikipedia, strip the page
        # chrome, and render the cleaned body.
        url = "https://en.wikipedia.org/wiki/" + quote(url_name)
        if can_access(url):
            req = Request(url, data=None, headers={"User-Agent": settings.USER_AGENT})
            response = urlopen(req).read()
            page = BeautifulSoup(response, "html.parser")
            title = str(page.find(id="firstHeading").string)
            body = page.find(id="bodyContent")
            # Remove the inline "[edit]" section links and the jump-to-nav
            # anchor, which only make sense on Wikipedia itself.
            for edits in body.find_all(class_="mw-editsection"):
                edits.extract()
            jump = body.find(id="jump-to-nav")
            if jump:
                jump.extract()
            return render(request, "wikipage/show.html", {"title": title, "body": str(body)})
        else:
            # If robots.txt forbids fetching, just send the user to Wikipedia.
            return HttpResponseRedirect(url)
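# A minimal urls.py sketch (assumed, not part of the original source) showing
# one way the show() view above could be wired up so that /wiki/<url_name>/
# resolves to a WikiPage lookup:
from django.urls import path

from . import views

urlpatterns = [
    path("wiki/<str:url_name>/", views.show, name="wikipage-show"),
]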