def handle(self, *args, **options):
        """Crawl every URL in ``settings.HAYSTACK_STATIC_PAGES`` and store it
        as a ``StaticPage`` row for haystack indexing.

        Options:
            port     -- optional port (string of digits) appended to the
                        current site's domain when building absolute URLs.
            language -- optional language code; activated via Django
                        translation before crawling.

        Raises CommandError on positional args or a non-numeric port.
        """
        # Usage string for the error below.  The original referenced an
        # undefined name ``cmd`` here, which raised NameError instead of
        # the intended CommandError.
        cmd = 'crawl_static_pages [-p PORT] [-l LANG]'
        if args:
            raise CommandError('Usage is: %s' % cmd)

        self.port = options.get('port')

        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' % self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        for url in settings.HAYSTACK_STATIC_PAGES:
            if not url.startswith('http://'):
                # Entry is a named URL pattern: resolve it and build an
                # absolute URL on the current site (with optional port).
                domain = Site.objects.get_current().domain
                if self.port:
                    url = 'http://%s:%r%s' % (domain, self.port, reverse(url))
                else:
                    url = 'http://%s%s' % (domain, reverse(url))

            print('Analyzing %s...' % url)

            # Update the existing row if this URL was indexed before,
            # otherwise create a fresh (unsaved) one.
            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            try:
                html = urllib2.urlopen(url)
            except urllib2.URLError:
                # Best-effort crawl: report unreachable pages and move on.
                print("Error while reading '%s'" % url)
                continue

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''
            page.language = soup.html.get('lang', 'en')
            # Store the full (pretty-printed) markup.
            page.content = soup.prettify()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
# Exemplo n.º 2
    def handle(self, *args, **options):
        """Rebuild the ``StaticPage`` index from ``settings.HAYSTACK_STATIC_PAGES``.

        Each entry is either a local file path (mapped to a public URL via
        ``settings.HAYSTACK_STATIC_MAPPING``) or a URL / named URL pattern
        fetched over HTTP.  All existing ``StaticPage`` rows are deleted
        before crawling.

        Raises CommandError on positional args or a non-numeric port.
        """
        if args:
            raise CommandError('Usage is: %s' % self.cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        # Full rebuild: drop every previously indexed page first.
        StaticPage.objects.all().delete()

        for resource in settings.HAYSTACK_STATIC_PAGES:
            if resource.startswith('/') and os.path.isfile(resource):
                # Local file: read its markup and derive the public URL from
                # the configured path-prefix mapping.  The original leaked
                # the file handle; close it deterministically.
                with open(resource, 'r') as fh:
                    html = fh.read()
                url = None
                for key in settings.HAYSTACK_STATIC_MAPPING.keys():
                    if resource.startswith(key):
                        tail = resource.split(key + '/')[1]
                        head = settings.HAYSTACK_STATIC_MAPPING[key]
                        url = u'%s%s' % (head, tail)
                if url is None:
                    # No mapping matched: full_clean() would reject a null
                    # URL and abort the whole run, so skip this resource.
                    print("No URL mapping found for '%s', skipping" % resource)
                    continue
            else:
                if resource.startswith('http://'):
                    url = resource
                else:
                    # Named URL pattern: build an absolute URL on the
                    # current site (with optional port).
                    domain = Site.objects.get_current().domain
                    if self.port:
                        url = 'http://%s:%r%s' % (domain, self.port,
                                                  reverse(resource))
                    else:
                        url = 'http://%s%s' % (domain, reverse(resource))

                try:
                    html = urllib2.urlopen(url)
                except urllib2.URLError:
                    # Best-effort crawl: report and continue.
                    print("Error while reading '%s'" % url)
                    continue

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # Index only the visible body text, with scripts stripped out.
            # (Plain loop instead of a side-effect list comprehension.)
            body = soup.find('body')
            for script in body.findAll('script'):
                script.extract()
            page.content = body.text

            page.language = soup.html.get('lang') or self.language

            page.full_clean()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
    def handle(self, *args, **options):
        """Delete and re-crawl all static pages listed in
        ``settings.HAYSTACK_STATIC_PAGES``.

        Entries starting with ``/`` that exist on disk are read locally and
        mapped to URLs via ``settings.HAYSTACK_STATIC_MAPPING``; everything
        else is fetched over HTTP (named URL patterns are reversed against
        the current site).

        Raises CommandError on positional args or a non-numeric port.
        """
        if args:
            raise CommandError('Usage is: %s' % self.cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' % self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        # Full rebuild: clear the index before crawling.
        StaticPage.objects.all().delete()

        for resource in settings.HAYSTACK_STATIC_PAGES:
            if resource.startswith('/') and os.path.isfile(resource):
                # Local file.  The original left the file handle open;
                # read and close it instead.
                with open(resource, 'r') as fh:
                    html = fh.read()
                url = None
                for key in settings.HAYSTACK_STATIC_MAPPING.keys():
                    if resource.startswith(key):
                        tail = resource.split(key + '/')[1]
                        head = settings.HAYSTACK_STATIC_MAPPING[key]
                        url = u'%s%s' % (head, tail)
                if url is None:
                    # Unmapped path: skip rather than let full_clean()
                    # crash on a null URL.
                    print("No URL mapping found for '%s', skipping" % resource)
                    continue
            else:
                if resource.startswith('http://'):
                    url = resource
                else:
                    # Named URL pattern -> absolute URL on the current site.
                    domain = Site.objects.get_current().domain
                    if self.port:
                        url = 'http://%s:%r%s' % (domain, self.port, reverse(resource))
                    else:
                        url = 'http://%s%s' % (domain, reverse(resource))

                try:
                    html = urllib2.urlopen(url)
                except urllib2.URLError:
                    print("Error while reading '%s'" % url)
                    continue

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # Index only visible body text; drop <script> elements first.
            body = soup.find('body')
            for script in body.findAll('script'):
                script.extract()
            page.content = body.text

            page.language = soup.html.get('lang') or self.language

            page.full_clean()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
    def handle(self, *args, **options):
        """Crawl ``settings.HAYSTACK_STATIC_PAGES`` through an authenticated
        ``requests`` session and index the ``class="content"`` text of each
        page as a ``StaticPage``.

        Logs in first via ``settings.HAYSTACK_STATIC_LOGIN_PAGE`` (posting
        ``settings.HAYSTACK_STATIC_LOGIN_AUTH`` plus the CSRF token when
        configured), then fetches every page with the session cookies.

        Raises CommandError on positional args or a non-numeric port.
        """
        # Usage string for the error below.  The original referenced an
        # undefined name ``cmd`` here, raising NameError instead of the
        # intended CommandError.
        cmd = 'crawl_static_pages [-p PORT] [-l LANG]'
        if args:
            raise CommandError('Usage is: %s' % cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        # Log in: GET the login page to obtain cookies (incl. csrftoken),
        # then POST the credentials with the CSRF token attached.
        login_url = '%s%s' % (settings.SERVER_URL,
                              reverse(settings.HAYSTACK_STATIC_LOGIN_PAGE))
        session = requests.Session()
        session.get(login_url)

        login_data = {}
        if hasattr(settings, 'HAYSTACK_STATIC_LOGIN_AUTH'):
            login_data = settings.HAYSTACK_STATIC_LOGIN_AUTH
            login_data.update(
                {'csrfmiddlewaretoken': session.cookies.get('csrftoken')})

        session.post(login_url, data=login_data, cookies=session.cookies)

        for url in settings.HAYSTACK_STATIC_PAGES:

            if not url.startswith('http://'):
                # Named URL pattern: resolve against SERVER_URL.
                if self.port:
                    url = '%s:%r%s' % (settings.SERVER_URL, self.port,
                                       reverse(url))
                else:
                    url = '%s%s' % (settings.SERVER_URL, reverse(url))

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            # Broad catch is deliberate: any fetch/parse failure (network,
            # missing class="content" element, ...) skips just this page.
            try:
                html = session.get(url, cookies=session.cookies).content
                soup = BeautifulSoup(html, "html.parser")
                page_content = soup.find(class_='content').get_text()
            except Exception as e:
                print("Error while reading '%s:%s'" % (url, e))
                continue

            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''
            page.language = soup.html.get('lang', 'en')
            page.content = page_content
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
    def handle(self, *args, **options):
        """Crawl ``settings.HAYSTACK_STATIC_PAGES`` plus any extra URLs/names
        given via the ``urls`` / ``names`` options and index them.

        Behaviour toggles:
          * ``HAYSTACK_STATIC_PAGES_STORE_REL_URL`` -- store only the URL
            path instead of the absolute URL.
          * ``strip_html`` option or ``HAYSTACK_STATIC_PAGES_STRIP_HTML`` --
            index tag-stripped body text instead of the full markup.

        Raises CommandError on positional args or a non-numeric port.
        """
        cmd = 'crawl_static_pages [-p PORT] [-l LANG] [-u LIST OF URLs]'
        if args:
            raise CommandError('Usage is: %s' % cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' % self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        urls_to_index = list(settings.HAYSTACK_STATIC_PAGES)
        if options.get('urls'):
            urls_to_index.extend(options.get('urls'))
        if options.get('names'):
            urls_to_index.extend(options.get('names'))

        for url in urls_to_index:
            if not url.startswith('http://'):
                try:
                    # Treat the entry as a named URL pattern first.
                    domain = Site.objects.get_current().domain
                    if self.port:
                        url = 'http://%s:%r%s' % (domain, self.port, reverse(url))
                    else:
                        url = 'http://%s%s' % (domain, reverse(url))
                except NoReverseMatch:
                    # Not a named pattern: treat it as a raw path and probe
                    # it once to confirm it is reachable.  Narrowed from a
                    # bare ``except:`` so Ctrl-C/SystemExit still propagate.
                    try:
                        url = 'http://%s%s' % (Site.objects.get_current().domain, url)
                        urllib2.urlopen(url)
                    except Exception:
                        print('No reverse match found for named url and is not valid url\n%s' % url)
                        continue

            print('Analyzing %s...' % url)

            # Optionally store only the path portion of the URL.
            if not hasattr(settings, 'HAYSTACK_STATIC_PAGES_STORE_REL_URL') or \
                not settings.HAYSTACK_STATIC_PAGES_STORE_REL_URL:
                store_url = url
            else:
                store_url = urlparse.urlsplit(url).path

            try:
                page = StaticPage.objects.get(url=store_url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=store_url)

            try:
                html = urllib2.urlopen(url)
            except urllib2.URLError:
                print("Error while reading '%s'" % url)
                continue

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''
            page.language = soup.html.get('lang', u'en-US')
            if options.get('strip_html') or hasattr(settings, 'HAYSTACK_STATIC_PAGES_STRIP_HTML') and settings.HAYSTACK_STATIC_PAGES_STRIP_HTML:
                # Remove inline javascript, then index tag-stripped body text.
                for script in soup('script'):
                    script.extract()
                page.content = strip_tags(unicode(soup.body))
            else:
                page.content = soup.prettify()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
# Exemplo n.º 6
    def handle(self, *args, **options):
        """Crawl every URL in ``settings.HAYSTACK_STATIC_PAGES`` and store the
        pretty-printed markup of each page as a ``StaticPage`` row.

        Options:
            port     -- optional port (string of digits) for built URLs.
            language -- optional language code activated before crawling.

        Raises CommandError on positional args or a non-numeric port.
        """
        # Usage string for the error below.  The original referenced an
        # undefined name ``cmd`` here, which raised NameError instead of
        # the intended CommandError.
        cmd = 'crawl_static_pages [-p PORT] [-l LANG]'
        if args:
            raise CommandError('Usage is: %s' % cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        count = 0

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        for url in settings.HAYSTACK_STATIC_PAGES:
            if not url.startswith('http://'):
                # Named URL pattern -> absolute URL on the current site.
                domain = Site.objects.get_current().domain
                if self.port:
                    url = 'http://%s:%r%s' % (domain, self.port, reverse(url))
                else:
                    url = 'http://%s%s' % (domain, reverse(url))

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            try:
                html = urllib2.urlopen(url)
            except urllib2.URLError:
                # Best-effort crawl: report and continue.
                print("Error while reading '%s'" % url)
                continue

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                # Page has no <head><title>.
                page.title = 'Untitled'
            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''
            page.language = soup.html.get('lang', 'en')
            page.content = soup.prettify()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)