Example No. 1
def is_sitemap(body):
    parser = etree.XMLParser(recover=True, encoding='utf-8')
    try:
        return SITEMAP_SCHEMA.validate(etree.parse(StringIO(body), parser))
    except ValueError as e:
        logger.log_error('ValueError raised: %s, body %s' % (e, body))
        return False
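The module-level imports and SITEMAP_SCHEMA are not shown in this excerpt; a minimal sketch of how they might be wired up, assuming the schema is the sitemaps.org XSD saved locally (the file name and the sample body below are made up):

from StringIO import StringIO
from lxml import etree

# Hypothetical schema setup; the real project builds SITEMAP_SCHEMA elsewhere.
SITEMAP_SCHEMA = etree.XMLSchema(etree.parse('sitemap.xsd'))

body = ('<?xml version="1.0" encoding="UTF-8"?>'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        '<url><loc>http://example.com/</loc></url>'
        '</urlset>')
print(is_sitemap(body))  # True only if the document parses and validates against SITEMAP_SCHEMA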
Example No. 2
    def seed(self):
        for sitedef in self.db.sitedefs.find():
            hostname = None
            try:
                # Skip malformed site definitions.
                if u'allowed_links' not in sitedef or \
                   u'syntax' not in sitedef or \
                   u'start_url' not in sitedef or \
                   not sitedef[u'start_url']:
                    logger.log_error('Bad site definition: ' + str(sitedef))
                    continue
                hostname = urlparse(sitedef[u'start_url']).netloc
                self.collection.insert({
                    'timestamp' : datetime.datetime.utcnow(),
                    'urls' : [sitedef[u'start_url']],
                    'hostname' : hostname,
                    'sitedef_id' : sitedef[u'_id']
                })
            except pymongo.errors.DuplicateKeyError:
                # A seed document already exists for this host: merge the start
                # URL into it and refresh the timestamp instead of inserting.
                logger.log_warning('Duplicate key error : %s' % (str(sitedef)))
                if not hostname:
                    continue
                self.collection.update({
                    'hostname' : hostname
                }, {
                    '$addToSet' : {
                        'urls' : sitedef[u'start_url']
                    },
                    '$set' : {
                        'timestamp' : datetime.datetime.utcnow()
                    }
                })
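For reference, a site definition that passes the checks in seed() might look like the sketch below; the field values are hypothetical, since seed() only requires that allowed_links and syntax exist and that start_url is non-empty:

from bson.objectid import ObjectId

# Hypothetical sitedef document, as read from db.sitedefs by seed().
sitedef = {
    u'_id': ObjectId(),
    u'start_url': u'http://example.com/',
    u'allowed_links': [u'example.com'],   # assumed shape; only the key's presence is checked here
    u'syntax': {u'title': u'h1'},         # CSS selectors per field (see Example No. 6)
}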
Example No. 3
    def run(self):
        try:
            item = self.pop_result()
            self.reduce(item)
        except Exception as e:
            logger.log_error('Exception caught %s' % e)
            traceback.print_exc()
            # For now, silently continue.
            return StrategyResult.PASSED
        return StrategyResult.SUCCESS
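StrategyResult is not defined on this page; a minimal stand-in, assuming it is a simple enum-like holder with the two values used here and in Example No. 7, would be:

# Assumed stand-in; the project's real StrategyResult may carry more states.
class StrategyResult(object):
    SUCCESS = 'success'
    PASSED = 'passed'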
Example No. 4
def parse_links(body, url, selector_filter=DEFAULT_SELECTOR_FILTER, restrict_to_host=False):
    # Validate arguments before handing body to BeautifulSoup.
    if not body or not url:
        logger.log_warning('parse_links invoked with bad args : (%s,%s)' % (body, url))
        return []
    soup = BeautifulSoup(body, 'lxml')
    try:
        all_links = soup.select(selector_filter)
    except ValueError as e:
        logger.log_error('[%s] Bad CSS Selector: %s | %s' % (url, selector_filter, e))
        return []
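A usage sketch for the excerpt above (which is cut off before it resolves the selected anchors into link URLs); the HTML, base URL, and the narrower selector_filter are made up, and DEFAULT_SELECTOR_FILTER is assumed to match anchor tags:

html = ('<html><body>'
        '<a href="/about">About</a>'
        '<a href="/blog/post-1">First post</a>'
        '</body></html>')

# Default filter: every matching anchor on the page is considered.
links = parse_links(html, 'http://example.com/')

# Hypothetical narrower filter: only follow links under /blog/.
blog_links = parse_links(html, 'http://example.com/', selector_filter='a[href^="/blog/"]')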
Example No. 5
    def upsert_result(self, result, pk=None):
        conn = r.connect(self.hostname, self.port)
        try:
            if pk:
                # Replace the existing document wholesale, keeping its primary key.
                replace_query = {"id": pk}
                for key in result:
                    replace_query[key] = result[key]
                r.db(self.dbname).table(self.tablename).get(pk).replace(replace_query).run(conn)
            else:
                # No primary key given: insert as a new document.
                r.db(self.dbname).table(self.tablename).insert(result).run(conn)
            return True
        except r.errors.RqlDriverError as e:
            logger.log_error(
                "RethinkSchema.upsert_result failed to insert with params(%s,%s) Error: %s" % (result, pk, e)
            )
            return False
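A usage sketch, assuming the surrounding RethinkSchema object is constructed with the hostname, port, dbname, and tablename used above (the constructor signature, document fields, and primary key are hypothetical):

schema = RethinkSchema('localhost', 28015, 'crawler', 'results')  # assumed constructor

# Insert a new result document (no primary key yet).
schema.upsert_result({'url': 'http://example.com/', 'title': 'Example'})

# Replace an existing document in place by its primary key.
schema.upsert_result({'url': 'http://example.com/', 'title': 'Updated'}, pk='some-existing-id')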
Example No. 6
def parse_syntax(body, syntax, url, container=None, soup=None):
    if not syntax:
        return None
    if not soup:
        soup = BeautifulSoup(body, 'lxml')
    result = {}
    for key in syntax:
        # 'container' is a reserved keyword in syntax definitions.
        if key == 'container':
            pass
        elif isinstance(syntax[key], basestring) and len(syntax[key].strip()):
            # A string value is treated as a CSS selector for this field.
            selector = syntax[key]
            try:
                nodes = soup.select(selector)
            except IndexError as e:
                logger.log_error('[%s] IndexError with selector %s %s' % (url, selector, e))
                continue
            if len(nodes) == 0:
                logger.log_warning('[%s] Could not find selector %s' % (url, str(selector)))
                continue
            node = nodes[0]
            val = node.get_text(' ', strip=True)
            if not val:
                # Fall back to the element's value attribute (e.g. for <input> tags).
                if node.has_attr('value'):
                    val = node[u'value']
                #elif ...:

            result[key] = {
                'text' : val
            }
            if node.has_attr('href'):
                if url:
                    result[key]['href'] = urlparse.urljoin(url, node['href'])
                else:
                    result[key]['href'] = node['href']
        elif isinstance(syntax[key], dict):
            # Nested syntax definitions are parsed recursively against the same soup.
            result[key] = parse_syntax(body, syntax[key], url, soup=soup, container=container)
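The excerpt above is cut off before its return statement; assuming the full function ultimately returns result, a sketch of the syntax shape it consumes (hypothetical HTML and field names; CSS-selector strings with optional nested dicts, as in the code above) would be:

html = '''
<html><body>
  <h1>Sample product</h1>
  <div class="price">19.99</div>
  <a class="more" href="/products/1">Details</a>
</body></html>
'''

syntax = {
    u'title': u'h1',
    u'details': {
        u'price': u'.price',
        u'link': u'a.more',   # href is resolved against url when present
    },
}

parsed = parse_syntax(html, syntax, 'http://example.com/')
# e.g. parsed['title']['text'] == 'Sample product'
# and parsed['details']['link']['href'] == 'http://example.com/products/1'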
Example No. 7
    def run(self):
        with self.get_job(start_offset=self.crawl_delay) as job:
            #This is just so that we can retrieve url when printing an exception
            url = None
            try:
                if not job:
                    return
                url = job.get_url()
                #TODO: make respect robots configurable
                resp_args = {
                    'cache' : self.cache_requests,
                    # Default to honoring robots.txt unless the config explicitly disables it.
                    'respect_robots' : config.respect_robots if config.respect_robots is not None else True,
                }
                cookies = job.get_cookies()
                if cookies:
                    resp_args['cookies'] = cookies
                resp = job.get(url,**resp_args)
                if not resp:
                    logger.log_info('Failed to get url: ' + url)
                    return StrategyResult.PASSED
                if resp.status_code != 200:
                    logger.log_warning('Job with URL %s failed with code(%s)' % \
                        (url, resp.status_code))
                    return StrategyResult.PASSED

                if self.can_scrape(url, job):
                    #maybe this can be put together into the same step
                    items = self.get_scraped_items(resp, job)
                    for item in items:
                        self.on_item_scraped(item, job)

                content_type = resp.headers.get('content-type')
                sitemap_links = None
                # Treat the response as a sitemap only when it both looks like XML
                # (by content type or extension) and validates as a sitemap.
                if (content_type == 'text/xml' or
                        content_type == 'application/xml' or
                        url.endswith('.xml')) and sitemap.is_sitemap(resp.content):
                    sitemap_links = sitemap.get_links(resp.content)

                follow_filter = job.get_follow_filter()
                # Prefer links from the sitemap when we have them; otherwise parse
                # anchors out of the page, honoring the job's follow filter if set.
                if sitemap_links:
                    links = [link.loc for link in sitemap_links]
                elif follow_filter:
                    links = linkparser.parse_links(
                        resp.content,
                        url,
                        restrict_to_host=True,
                        selector_filter=follow_filter
                    )
                else:
                    links = linkparser.parse_links(
                        resp.content,
                        url,
                        restrict_to_host=True
                    )
                if not follow_filter:
                    links = filter(lambda link: linkparser.can_follow(link, job.get_allowed_links()), links)
                for link in links:
                    job.add_url(link)

                # Persist any cookies the server set so later requests reuse them.
                if resp.cookies and len(resp.cookies):
                    cookies = resp.cookies
                    if isinstance(resp.cookies, RequestsCookieJar):
                        cookies = requests.utils.dict_from_cookiejar(resp.cookies)
                    job.set_cookies(cookies)

            except Exception as e:
                logger.log_error('[%s] Exception caught %s' % (url, e))
                return StrategyResult.PASSED
            return StrategyResult.SUCCESS