Example #1
def run(app):
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0

    while True:
        try:
            if 'site-id' in sys.argv:
                # sys.argv is a list, so take the value that follows the flag
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid,
                                         exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
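
The excerpt above assumes a handful of module-level imports and project helpers. A minimal sketch of what it relies on, assuming a stock Plone/Zope environment; the project-specific helpers at the bottom are placeholders, not real import paths:

import logging
import sys
import time

from Products.CMFPlone.interfaces import IPloneSiteRoot
from tendo import singleton  # provides singleton.SingleInstance

logger = logging.getLogger('crawler')

# Helpers assumed by the excerpt; their real module paths depend on the
# surrounding package:
# from somepackage.utils import spoof_request, login_as_admin
# from somepackage.crawler import setup_site, crawl_site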
Example #2
    def crawl_page(self, url):
        logger.info('Indexing ' + url)
        resp = requests.get(url, stream=True, headers={
            'User-Agent': self.settings.crawler_user_agent
        })
        if (resp.status_code == 404 or
                'html' not in resp.headers.get('content-type', '') or
                int(resp.headers.get('content-length', 0)) >= MAX_PAGE_SIZE):
            # remove from index
            return False
        dom = html.fromstring(resp.content)
        parsed = urlparse(url)
        data = {
            'url': url,
            'domain': parsed.netloc
        }

        for name, selectors in self._meta_properties.items():
            for selector in selectors:
                result = dom.cssselect(selector)
                if len(result) > 0:
                    result = result[0]
                    if result.attrib.get('content'):
                        data[name] = result.attrib['content']
                        break
                    elif result.text_content().strip():
                        # use text_content() so text inside child tags is kept
                        # and a None .text does not raise AttributeError
                        data[name] = result.text_content().strip()
                        break

        for date_field in self.date_fields:
            val = data.get(date_field)
            if val:
                try:
                    data[date_field] = DateTime(val).ISO8601()
                except Exception:
                    pass

        searchable_text = [
            data.get('Title', ''),
            data.get('Description', '')
        ]
        for el in dom.cssselect(self.searchable_text_selector):
            searchable_text.append(el.text_content())
        data['SearchableText'] = ' '.join(searchable_text)

        return data
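
A hypothetical caller for the method above, given a Crawler instance like the one built in Example #5: crawl_page returns False when the page should be dropped from the index, otherwise a dict of extracted fields. The helpers named here are placeholders:

url = 'https://example.com/some-page'
data = crawler.crawl_page(url)
if data is False:
    # 404, non-HTML response or oversized page: drop it from the index
    remove_from_index(url)       # placeholder helper
else:
    index_document(url, data)    # placeholder helper, e.g. push into Elasticsearch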
Example #3
    def crawl_page(self, url):
        logger.info('Indexing ' + url)
        try:
            resp = requests.get(
                url, headers={'User-Agent': self.settings.crawler_user_agent})
        except Exception:
            # unable to access the page, remove for now
            return False
        if resp.status_code == 404 or 'html' not in resp.headers.get(
                'content-type', ''):
            # remove from index
            return False
        try:
            dom = html.fromstring(resp.content)
        except etree.XMLSyntaxError:
            # unable to parse html, remove for now
            # lxml has been known to throw this as a bug; maybe use BeautifulSoup
            return False
        parsed = urlparse(url)
        data = {'url': url, 'domain': parsed.netloc}

        for name, selectors in self._meta_properties.items():
            for selector in selectors:
                result = dom.cssselect(selector)
                if len(result) > 0:
                    result = result[0]
                    if result.attrib.get('content'):
                        data[name] = result.attrib['content']
                        break
                    elif result.text_content().strip():
                        # use text_content() so text inside child tags is kept
                        # and a None .text does not raise AttributeError
                        data[name] = result.text_content().strip()
                        break

        for date_field in self.date_fields:
            val = data.get(date_field)
            if val:
                try:
                    data[date_field] = DateTime(val).ISO8601()
                except Exception:
                    pass

        searchable_text = [data.get('Title', ''), data.get('Description', '')]
        for el in dom.cssselect(self.searchable_text_selector):
            searchable_text.append(el.text_content())
        data['SearchableText'] = ' '.join(searchable_text)

        return data
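
The inline comment above hints at falling back to BeautifulSoup when lxml rejects the markup. A rough sketch of that fallback, assuming beautifulsoup4 is available (this is not part of the original example):

from bs4 import BeautifulSoup
from lxml import etree, html

def parse_html(content):
    try:
        return html.fromstring(content)
    except etree.XMLSyntaxError:
        # let BeautifulSoup normalize the broken markup, then re-parse with lxml
        cleaned = str(BeautifulSoup(content, 'html.parser'))
        return html.fromstring(cleaned)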
Example #4
def run(app):
    singleton.SingleInstance('twittermonitor')

    user = app.acl_users.getUser('admin')  # noqa
    newSecurityManager(None, user.__of__(app.acl_users))  # noqa

    while True:
        try:
            if 'site-id' in sys.argv:
                # sys.argv is a list, so take the value that follows the flag
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                attempt_twitter_on_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        attempt_twitter_on_site(obj)
        except KeyError:
            pass
        logger.info('Could not find valid site to monitor')
        time.sleep(10 * 60)
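
The two acl_users lines at the top of this example are the standard Zope idiom for running a script with admin privileges. In isolation, with the import it comes from (assuming a stock Zope/Plone instance):

from AccessControl.SecurityManagement import newSecurityManager

def become_admin(app):
    # wrap the user in the acl_users folder so permission checks resolve correctly
    user = app.acl_users.getUser('admin')
    newSecurityManager(None, user.__of__(app.acl_users))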
Example #5
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        logger.info("Crawler must first be enabled in Site Setup")
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    index_name = '{site_index_name}_crawler'.format(
        site_index_name=es.index_name)
    if not es.enabled:
        logger.info(
            "Elasticsearch must be enabled in Site Setup to use crawler")
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=index_name)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        if not es.connection.indices.exists(index_name):
            es.connection.indices.create(index_name)
        es.connection.indices.put_mapping(body=mapping, index=index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True
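
A hypothetical one-off invocation, e.g. from a debug session inside a Plone site, using plone.api to obtain the site object; the full=True flag presumably forces a complete rather than incremental pass over the configured sitemaps:

from plone import api

site = api.portal.get()      # requires an active Plone site/request context
crawl_site(site, full=True)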