def handle(self, *args, **options):
    """Crawl the pages listed in ``settings.HAYSTACK_STATIC_PAGES``.

    Each entry is either an absolute ``http://``/``https://`` URL, which is
    fetched as is, or a URL name that is resolved with ``reverse()`` and
    prefixed with the current ``Site`` domain (and the optional port).
    Every page that can be downloaded is parsed with BeautifulSoup and
    saved as a ``StaticPage`` row, updating the row with the same URL when
    one already exists.

    Options read from ``options``:
        port: optional string of digits, used in URLs built from names.
        language: optional language code, activated before the crawl.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    if args:
        # Prefer a usage string declared on the command object; fall back
        # to a module-level ``cmd`` so either layout keeps working.
        usage = getattr(self, 'cmd', None)
        if usage is None:
            usage = cmd
        raise CommandError('Usage is: %s' % usage)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    count = 0
    for url in settings.HAYSTACK_STATIC_PAGES:
        # Anything that is not already an absolute URL is a URL name.
        if not url.startswith(('http://', 'https://')):
            domain = Site.objects.get_current().domain
            if self.port:
                url = 'http://%s:%r%s' % (domain, self.port, reverse(url))
            else:
                url = 'http://%s%s' % (domain, reverse(url))

        print('Analyzing %s...' % url)

        try:
            page = StaticPage.objects.get(url=url)
            print('%s already exists in the index, updating...' % url)
        except StaticPage.DoesNotExist:
            print('%s is new, adding...' % url)
            page = StaticPage(url=url)

        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError:
            print("Error while reading '%s'" % url)
            continue
        try:
            soup = BeautifulSoup(response)
        finally:
            # Release the connection even when parsing fails.
            response.close()

        try:
            title_text = soup.head.title.string
        except AttributeError:
            # No <head> or no <title>.
            title_text = None
        # ``.string`` is None for an empty <title> or one with nested
        # markup; do not hand None to escape().
        page.title = 'Untitled' if title_text is None else escape(title_text)

        meta = soup.find('meta', attrs={'name': 'description'})
        page.description = meta.get('content', '') if meta else ''

        # Documents without an <html> element get the default language.
        root = soup.html
        page.language = root.get('lang', 'en') if root is not None else 'en'

        page.content = soup.prettify()
        page.save()
        count += 1

    print('Crawled %d static pages' % count)
def handle(self, *args, **options):
    """Rebuild the ``StaticPage`` rows from ``settings.HAYSTACK_STATIC_PAGES``.

    All existing ``StaticPage`` rows are deleted first.  Each configured
    resource is then treated as one of:

    * an absolute path to an existing file: it is read from disk and its
      public URL is built by replacing the matching directory from
      ``settings.HAYSTACK_STATIC_MAPPING`` (plus the following slash) with
      the mapped URL head;
    * an absolute ``http://``/``https://`` URL, fetched as is;
    * anything else: a URL name resolved with ``reverse()`` and prefixed
      with the current ``Site`` domain (and the optional port).

    Only the text of ``<body>`` (scripts removed) is stored as content, and
    every page is validated with ``full_clean()`` before it is saved.

    Options read from ``options``:
        port: optional string of digits, used in URLs built from names.
        language: optional language code, activated before the crawl and
            used when the document declares no ``lang``.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    if args:
        raise CommandError('Usage is: %s' % self.cmd)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    StaticPage.objects.all().delete()

    count = 0
    for resource in settings.HAYSTACK_STATIC_PAGES:
        if resource.startswith('/') and os.path.isfile(resource):
            mapping = settings.HAYSTACK_STATIC_MAPPING
            url = None
            # Longest directory first, so the most specific mapping wins
            # regardless of dict ordering.
            for key in sorted(mapping, key=len, reverse=True):
                prefix = key.rstrip('/') + '/'
                if resource.startswith(prefix):
                    # Slice instead of split(): the prefix may occur again
                    # further down the path.
                    url = u'%s%s' % (mapping[key], resource[len(prefix):])
                    break
            if url is None:
                print("No HAYSTACK_STATIC_MAPPING entry matches '%s', "
                      "skipping" % resource)
                continue
            source = open(resource, 'r')
        else:
            if resource.startswith(('http://', 'https://')):
                url = resource
            else:
                domain = Site.objects.get_current().domain
                if self.port:
                    url = 'http://%s:%r%s' % (
                        domain, self.port, reverse(resource))
                else:
                    url = 'http://%s%s' % (domain, reverse(resource))
            try:
                source = urllib2.urlopen(url)
            except urllib2.URLError:
                print("Error while reading '%s'" % url)
                continue

        try:
            print('Analyzing %s...' % url)
            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)
            soup = BeautifulSoup(source)
        finally:
            # Close the file or HTTP response whatever happens above.
            source.close()

        try:
            title_text = soup.head.title.string
        except AttributeError:
            # No <head> or no <title>.
            title_text = None
        # ``.string`` is None for an empty <title> or one with nested
        # markup; do not hand None to escape().
        page.title = 'Untitled' if title_text is None else escape(title_text)

        meta = soup.find('meta', attrs={'name': 'description'})
        page.description = meta.get('content', '') if meta else ''

        # Save only the body text, without scripts.  Fragments that have no
        # <body> fall back to the whole document.
        body = soup.find('body')
        if body is None:
            body = soup
        for script in body.findAll('script'):
            script.extract()
        page.content = body.text

        root = soup.html
        declared = root.get('lang') if root is not None else None
        page.language = declared or self.language

        page.full_clean()
        page.save()
        count += 1

    print('Crawled %d static pages' % count)
def handle(self, *args, **options):
    """Rebuild the ``StaticPage`` rows from ``settings.HAYSTACK_STATIC_PAGES``.

    Existing ``StaticPage`` rows are all deleted before crawling.  A
    configured resource may be:

    * an absolute path to an existing file, read from disk; its URL is the
      mapped head from ``settings.HAYSTACK_STATIC_MAPPING`` followed by the
      part of the path after the matching directory and its slash;
    * an absolute ``http://``/``https://`` URL, downloaded unchanged;
    * a URL name, resolved with ``reverse()`` and prefixed with the current
      ``Site`` domain (and the optional port).

    The stored content is the text of ``<body>`` with scripts removed, and
    each page goes through ``full_clean()`` before ``save()``.

    Options read from ``options``:
        port: optional string of digits, used in URLs built from names.
        language: optional language code, activated before the crawl and
            used when the document declares no ``lang``.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    if args:
        raise CommandError('Usage is: %s' % self.cmd)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    StaticPage.objects.all().delete()

    count = 0
    for resource in settings.HAYSTACK_STATIC_PAGES:
        is_local_file = resource.startswith('/') and os.path.isfile(resource)

        if is_local_file:
            static_mapping = settings.HAYSTACK_STATIC_MAPPING
            url = None
            # Try the longest directory first so nested mappings resolve to
            # the most specific entry instead of depending on dict order.
            for directory in sorted(static_mapping, key=len, reverse=True):
                prefix = directory.rstrip('/') + '/'
                if not resource.startswith(prefix):
                    continue
                # Slicing keeps the full remainder even when the prefix
                # text shows up again later in the path.
                tail = resource[len(prefix):]
                url = u'%s%s' % (static_mapping[directory], tail)
                break
            if url is None:
                print("No HAYSTACK_STATIC_MAPPING entry matches '%s', "
                      "skipping" % resource)
                continue
            handle_in = open(resource, 'r')
        else:
            if resource.startswith(('http://', 'https://')):
                url = resource
            else:
                domain = Site.objects.get_current().domain
                path = reverse(resource)
                if self.port:
                    url = 'http://%s:%r%s' % (domain, self.port, path)
                else:
                    url = 'http://%s%s' % (domain, path)
            try:
                handle_in = urllib2.urlopen(url)
            except urllib2.URLError:
                print("Error while reading '%s'" % url)
                continue

        try:
            print('Analyzing %s...' % url)
            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)
            soup = BeautifulSoup(handle_in)
        finally:
            # The file or HTTP response is no longer needed once parsed.
            handle_in.close()

        try:
            title_text = soup.head.title.string
        except AttributeError:
            # No <head> or no <title>.
            title_text = None
        # ``.string`` is None for an empty <title> or one with nested
        # markup; do not hand None to escape().
        if title_text is None:
            page.title = 'Untitled'
        else:
            page.title = escape(title_text)

        meta = soup.find('meta', attrs={'name': 'description'})
        if meta:
            page.description = meta.get('content', '')
        else:
            page.description = ''

        # Save only the body text, without scripts; a document with no
        # <body> contributes its whole text instead of crashing the crawl.
        body = soup.find('body')
        if body is None:
            body = soup
        for script in body.findAll('script'):
            script.extract()
        page.content = body.text

        html_tag = soup.html
        declared = html_tag.get('lang') if html_tag is not None else None
        page.language = declared or self.language

        page.full_clean()
        page.save()
        count += 1

    print('Crawled %d static pages' % count)
def handle(self, *args, **options):
    """Log in to the site, then crawl ``settings.HAYSTACK_STATIC_PAGES``.

    A ``requests`` session first GETs the login page (reversed from
    ``settings.HAYSTACK_STATIC_LOGIN_PAGE`` under ``settings.SERVER_URL``)
    and then POSTs ``settings.HAYSTACK_STATIC_LOGIN_AUTH`` (if defined)
    together with the ``csrftoken`` cookie value as ``csrfmiddlewaretoken``.
    The same session is used to download every page.

    Entries that are not absolute ``http://``/``https://`` URLs are URL
    names, resolved with ``reverse()`` under ``settings.SERVER_URL`` (and
    the optional port).  The text of the first element with class
    ``content`` is stored; pages that cannot be fetched or have no such
    element are reported and skipped.

    Options read from ``options``:
        port: optional string of digits, used in URLs built from names.
        language: optional language code, activated before the crawl.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    if args:
        # Prefer a usage string declared on the command object; fall back
        # to a module-level ``cmd`` so either layout keeps working.
        usage = getattr(self, 'cmd', None)
        if usage is None:
            usage = cmd
        raise CommandError('Usage is: %s' % usage)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    count = 0
    login_url = '%s%s' % (settings.SERVER_URL,
                          reverse(settings.HAYSTACK_STATIC_LOGIN_PAGE))
    session = requests.Session()
    try:
        # login: the initial GET is what populates the csrftoken cookie.
        session.get(login_url)
        # Copy the credentials so the settings dict itself is never mutated.
        login_data = dict(getattr(settings, 'HAYSTACK_STATIC_LOGIN_AUTH', {}))
        login_data['csrfmiddlewaretoken'] = session.cookies.get('csrftoken')
        session.post(login_url, data=login_data, cookies=session.cookies)

        for url in settings.HAYSTACK_STATIC_PAGES:
            # Anything that is not already an absolute URL is a URL name.
            if not url.startswith(('http://', 'https://')):
                if self.port:
                    url = '%s:%r%s' % (settings.SERVER_URL, self.port,
                                       reverse(url))
                else:
                    url = '%s%s' % (settings.SERVER_URL, reverse(url))

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            # Best effort: any failure to fetch, parse or locate the
            # ``content`` element skips this page instead of aborting.
            try:
                html = session.get(url, cookies=session.cookies).content
                soup = BeautifulSoup(html, "html.parser")
                page_content = soup.find(class_='content').get_text()
            except Exception as e:
                print("Error while reading '%s:%s'" % (url, e))
                continue

            try:
                title_text = soup.head.title.string
            except AttributeError:
                # No <head> or no <title>.
                title_text = None
            # ``.string`` is None for an empty <title> or one with nested
            # markup; do not hand None to escape().
            if title_text is None:
                page.title = 'Untitled'
            else:
                page.title = escape(title_text)

            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # Documents without an <html> element get the default language.
            root = soup.html
            page.language = root.get('lang', 'en') if root is not None else 'en'

            page.content = page_content
            page.save()
            count += 1
    finally:
        # Release the pooled connections held by the session.
        session.close()

    print('Crawled %d static pages' % count)
def handle(self, *args, **options):
    """Crawl configured and user-supplied pages into ``StaticPage`` rows.

    The list to crawl is ``settings.HAYSTACK_STATIC_PAGES`` followed by the
    ``urls`` and ``names`` options.  Absolute ``http://``/``https://``
    entries are fetched as is.  Other entries are first tried as URL names
    via ``reverse()``; when that fails they are treated as literal paths on
    the current ``Site`` domain and probed, and entries that cannot be
    downloaded that way are reported and skipped.

    Settings consulted (all optional):
        HAYSTACK_STATIC_PAGES_STORE_REL_URL: when true, only the path part
            of the URL is stored on the ``StaticPage``.
        HAYSTACK_STATIC_PAGES_STRIP_HTML: when true, scripts and tags are
            removed and only the body text is stored.

    Options read from ``options``:
        port: optional string of digits, used in URLs built for this site.
        language: optional language code, activated before the crawl.
        urls, names: optional iterables of extra entries to crawl.
        strip_html: same effect as ``HAYSTACK_STATIC_PAGES_STRIP_HTML``.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    cmd = 'crawl_static_pages [-p PORT] [-l LANG] [-u LIST OF URLs]'
    if args:
        raise CommandError('Usage is: %s' % cmd)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    def read_url(address):
        """Return the body served at *address*, always closing the response."""
        response = urllib2.urlopen(address)
        try:
            return response.read()
        finally:
            response.close()

    def absolute(path):
        """Prefix *path* with the current site's domain and optional port."""
        domain = Site.objects.get_current().domain
        if self.port:
            return 'http://%s:%r%s' % (domain, self.port, path)
        return 'http://%s%s' % (domain, path)

    urls_to_index = list(settings.HAYSTACK_STATIC_PAGES)
    urls_to_index.extend(options.get('urls') or [])
    urls_to_index.extend(options.get('names') or [])

    # Both switches are loop-invariant, so read them once.
    store_relative = getattr(
        settings, 'HAYSTACK_STATIC_PAGES_STORE_REL_URL', False)
    strip_html = options.get('strip_html') or getattr(
        settings, 'HAYSTACK_STATIC_PAGES_STRIP_HTML', False)

    count = 0
    for url in urls_to_index:
        markup = None
        if not url.startswith(('http://', 'https://')):
            try:
                path = reverse(url)
            except NoReverseMatch:
                # Not a URL name: treat it as a literal path on this site
                # (honouring the port) and keep the probe's body so the
                # page is not downloaded twice.
                url = absolute(url)
                try:
                    markup = read_url(url)
                except Exception:
                    # Deliberately broad, but unlike a bare ``except`` it
                    # lets KeyboardInterrupt/SystemExit through.
                    print('No reverse match found for named url and is '
                          'not valid url\n%s' % url)
                    continue
            else:
                url = absolute(path)

        print('Analyzing %s...' % url)

        store_url = urlparse.urlsplit(url).path if store_relative else url

        try:
            page = StaticPage.objects.get(url=store_url)
            print('%s already exists in the index, updating...' % url)
        except StaticPage.DoesNotExist:
            print('%s is new, adding...' % url)
            page = StaticPage(url=store_url)

        if markup is None:
            try:
                markup = read_url(url)
            except urllib2.URLError:
                print("Error while reading '%s'" % url)
                continue

        soup = BeautifulSoup(markup)

        try:
            title_text = soup.head.title.string
        except AttributeError:
            # No <head> or no <title>.
            title_text = None
        # ``.string`` is None for an empty <title> or one with nested
        # markup; do not hand None to escape().
        page.title = 'Untitled' if title_text is None else escape(title_text)

        meta = soup.find('meta', attrs={'name': 'description'})
        page.description = meta.get('content', '') if meta else ''

        # Documents without an <html> element get the default language.
        root = soup.html
        if root is not None:
            page.language = root.get('lang', u'en-US')
        else:
            page.language = u'en-US'

        if strip_html:
            # remove inline javascript
            for script in soup('script'):
                script.extract()
            # Without a <body>, strip the whole document rather than
            # storing the text of ``None``.
            body = soup.body if soup.body is not None else soup
            page.content = strip_tags(unicode(body))
        else:
            page.content = soup.prettify()

        page.save()
        count += 1

    print('Crawled %d static pages' % count)
def handle(self, *args, **options):
    """Crawl the pages listed in ``settings.HAYSTACK_STATIC_PAGES``.

    Absolute ``http://``/``https://`` entries are downloaded unchanged.
    Any other entry is a URL name: it is resolved with ``reverse()`` and
    prefixed with the current ``Site`` domain (and the optional port).
    Each downloaded page is parsed with BeautifulSoup and saved as a
    ``StaticPage`` row; a row with the same URL is updated if it exists.

    Options read from ``options``:
        port: optional string of digits, used in URLs built from names.
        language: optional language code, activated before the crawl.

    Raises:
        CommandError: if positional arguments are given, or if ``port``
            contains anything other than digits.
    """
    if args:
        # Prefer a usage string declared on the command object; fall back
        # to a module-level ``cmd`` so either layout keeps working.
        usage = getattr(self, 'cmd', None)
        if usage is None:
            usage = cmd
        raise CommandError('Usage is: %s' % usage)

    self.port = options.get('port')
    if self.port:
        if not self.port.isdigit():
            raise CommandError('%r is not a valid port number.' % self.port)
        self.port = int(self.port)

    self.language = options.get('language')
    if self.language:
        translation.activate(self.language)

    count = 0
    for url in settings.HAYSTACK_STATIC_PAGES:
        is_absolute = url.startswith(('http://', 'https://'))
        if not is_absolute:
            domain = Site.objects.get_current().domain
            path = reverse(url)
            if self.port:
                url = 'http://%s:%r%s' % (domain, self.port, path)
            else:
                url = 'http://%s%s' % (domain, path)

        print('Analyzing %s...' % url)

        try:
            page = StaticPage.objects.get(url=url)
            print('%s already exists in the index, updating...' % url)
        except StaticPage.DoesNotExist:
            print('%s is new, adding...' % url)
            page = StaticPage(url=url)

        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError:
            print("Error while reading '%s'" % url)
            continue
        try:
            soup = BeautifulSoup(response)
        finally:
            # Release the connection even when parsing fails.
            response.close()

        try:
            title_text = soup.head.title.string
        except AttributeError:
            # No <head> or no <title>.
            title_text = None
        # ``.string`` is None for an empty <title> or one with nested
        # markup; do not hand None to escape().
        if title_text is None:
            page.title = 'Untitled'
        else:
            page.title = escape(title_text)

        meta = soup.find('meta', attrs={'name': 'description'})
        if meta:
            page.description = meta.get('content', '')
        else:
            page.description = ''

        # Documents without an <html> element get the default language.
        html_tag = soup.html
        if html_tag is not None:
            page.language = html_tag.get('lang', 'en')
        else:
            page.language = 'en'

        page.content = soup.prettify()
        page.save()
        count += 1

    print('Crawled %d static pages' % count)