from StringIO import StringIO

from lxml import etree


def is_sitemap(body):
    # Use a recovering parser so slightly malformed XML can still be validated.
    parser = etree.XMLParser(recover=True, encoding='utf-8')
    try:
        return SITEMAP_SCHEMA.validate(etree.parse(StringIO(body), parser))
    except ValueError as e:
        logger.log_error('ValueError raised: %s, body %s' % (e, body))
        return False
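# SITEMAP_SCHEMA is defined elsewhere in the module. A minimal sketch of how
# such a validator can be built with lxml, assuming the sitemaps.org XSD has
# been saved locally as 'sitemap.xsd' (the filename is hypothetical):
#
#   SITEMAP_SCHEMA = etree.XMLSchema(etree.parse('sitemap.xsd'))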
def seed(self):
    for sitedef in self.db.sitedefs.find():
        hostname = None
        try:
            # Skip definitions that are missing required fields.
            if u'allowed_links' not in sitedef or \
               u'syntax' not in sitedef or \
               u'start_url' not in sitedef or \
               not sitedef[u'start_url']:
                logger.log_error('Bad site definition: ' + str(sitedef))
                continue
            hostname = urlparse(sitedef[u'start_url']).netloc
            self.collection.insert({
                'timestamp'  : datetime.datetime.utcnow(),
                'urls'       : [sitedef[u'start_url']],
                'hostname'   : hostname,
                'sitedef_id' : sitedef[u'_id'],
            })
        except pymongo.errors.DuplicateKeyError:
            # A seed document already exists for this hostname; merge the
            # start URL into its url set instead of inserting a new one.
            logger.log_warning('Duplicate key error : %s' % (str(sitedef)))
            if not hostname:
                continue
            self.collection.update(
                { 'hostname' : hostname },
                {
                    '$addToSet' : { 'urls' : sitedef[u'start_url'] },
                    '$set'      : { 'timestamp' : datetime.datetime.utcnow() },
                }
            )
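# For reference, a sitedef document is assumed to look roughly like this
# (hypothetical values; the required keys are the ones seed() checks for):
#
#   {
#       'start_url'     : 'http://example.com/',
#       'allowed_links' : ['^http://example\\.com/articles/'],
#       'syntax'        : { 'title' : 'h1.entry-title' },
#   }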
def run(self):
    try:
        item = self.pop_result()
        self.reduce(item)
    except Exception as e:
        logger.log_error('Exception caught %s' % e)
        traceback.print_exc()
        # For now, silently continue.
        return StrategyResult.PASSED
    return StrategyResult.SUCCESS
from bs4 import BeautifulSoup
from urlparse import urlparse, urljoin


def parse_links(body, url, selector_filter=DEFAULT_SELECTOR_FILTER,
                restrict_to_host=False):
    # Validate arguments before paying the cost of parsing the document.
    if not body or not url:
        logger.log_warning('parse_links invoked with bad args : (%s,%s)'
                           % (body, url))
        return []
    soup = BeautifulSoup(body, 'lxml')
    try:
        all_links = soup.select(selector_filter)
    except ValueError as e:
        logger.log_error('[%s] Bad CSS Selector: %s | %s'
                         % (url, selector_filter, e))
        return []
    # The listing is truncated here; the tail below is an assumed
    # reconstruction: resolve each href against the page URL and, if
    # requested, drop links that leave the current host.
    host = urlparse(url).netloc
    links = []
    for node in all_links:
        href = node.get('href')
        if not href:
            continue
        href = urljoin(url, href)
        if restrict_to_host and urlparse(href).netloc != host:
            continue
        links.append(href)
    return links
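# Hypothetical usage, assuming DEFAULT_SELECTOR_FILTER is an anchor selector
# along the lines of 'a[href]':
#
#   parse_links(resp.content, 'http://example.com/', restrict_to_host=True)
#   # -> ['http://example.com/articles/1', ...]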
def upsert_result(self, result, pk=None):
    conn = r.connect(self.hostname, self.port)
    try:
        if pk:
            # Replace the existing document wholesale, preserving its
            # primary key.
            replace_query = {'id': pk}
            for key in result:
                replace_query[key] = result[key]
            r.db(self.dbname).table(self.tablename) \
             .get(pk).replace(replace_query).run(conn)
        else:
            r.db(self.dbname).table(self.tablename).insert(result).run(conn)
        return True
    except r.errors.RqlDriverError as e:
        logger.log_error(
            'RethinkSchema.upsert_result failed to insert with params(%s,%s) Error: %s'
            % (result, pk, e)
        )
        return False
    finally:
        # Always release the connection, even on failure.
        conn.close()
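# Usage sketch (hypothetical values): pass pk to replace an existing
# document, omit it to insert a new one.
#
#   schema.upsert_result({'url': 'http://example.com/', 'title': 'Home'},
#                        pk='d2f1be43-...')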
import urlparse

from bs4 import BeautifulSoup


def parse_syntax(body, syntax, url, container=None, soup=None):
    if not syntax:
        return None
    if not soup:
        soup = BeautifulSoup(body, 'lxml')
    result = {}
    for key in syntax:
        # 'container' is a reserved key, not a selector.
        if key == 'container':
            pass
        elif isinstance(syntax[key], basestring) and len(syntax[key].strip()):
            selector = syntax[key]
            try:
                nodes = soup.select(selector)
            except IndexError as e:
                logger.log_error('[%s] IndexError with selector %s %s'
                                 % (url, selector, e))
                continue
            if len(nodes) == 0:
                logger.log_warning('[%s] Could not find selector %s'
                                   % (url, str(selector)))
                continue
            node = nodes[0]
            val = node.get_text(' ', strip=True)
            if not val:
                # Fall back to the node's value attribute (e.g. <input>).
                if node.has_attr('value'):
                    val = node[u'value']
                # elif ...: further fallbacks could go here
            result[key] = { 'text' : val }
            if node.has_attr('href'):
                if url:
                    result[key]['href'] = urlparse.urljoin(url, node['href'])
                else:
                    result[key]['href'] = node['href']
        elif isinstance(syntax[key], dict):
            # Nested syntax: recurse, reusing the already-built soup.
            result[key] = parse_syntax(body, syntax[key], url,
                                       soup=soup, container=container)
    return result
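# A hypothetical syntax definition, to illustrate the recursion: string
# values are CSS selectors (the first match wins), dict values are nested
# sub-syntaxes, and 'container' is reserved.
#
#   syntax = {
#       'title'  : 'h1.entry-title',
#       'author' : { 'name' : '.byline a' },
#   }
#   parse_syntax(body, syntax, url)
#   # -> {'title': {'text': '...'},
#   #     'author': {'name': {'text': '...', 'href': 'http://...'}}}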
def run(self):
    with self.get_job(start_offset=self.crawl_delay) as job:
        # Track the url outside the try block so exception logging can
        # include it.
        url = None
        try:
            if not job:
                return
            url = job.get_url()
            # TODO: make respecting robots.txt configurable; an explicit
            # False in the config must be honoured, so test against None.
            resp_args = {
                'cache'          : self.cache_requests,
                'respect_robots' : config.respect_robots
                                   if config.respect_robots is not None
                                   else True,
            }
            cookies = job.get_cookies()
            if cookies:
                resp_args['cookies'] = cookies
            resp = job.get(url, **resp_args)
            if not resp:
                logger.log_info('Failed to get url: ' + url)
                return StrategyResult.PASSED
            if resp.status_code != 200:
                logger.log_warning('Job with URL %s failed with code(%s)'
                                   % (url, resp.status_code))
                return StrategyResult.PASSED

            if self.can_scrape(url, job):
                # Maybe this can be put together into the same step.
                items = self.get_scraped_items(resp, job)
                for item in items:
                    self.on_item_scraped(item, job)

            # Follow-up links come from a sitemap when the response looks
            # like XML and validates as one, otherwise from the page body.
            content_type = resp.headers.get('content-type')
            sitemap_links = None
            if (content_type in ('text/xml', 'application/xml') or
                    url.endswith('.xml')) and sitemap.is_sitemap(resp.content):
                sitemap_links = sitemap.get_links(resp.content)

            follow_filter = job.get_follow_filter()
            if sitemap_links:
                links = [link.loc for link in sitemap_links]
            elif follow_filter:
                links = linkparser.parse_links(resp.content, url,
                                               restrict_to_host=True,
                                               selector_filter=follow_filter)
            else:
                links = linkparser.parse_links(resp.content, url,
                                               restrict_to_host=True)
            if not follow_filter:
                # Without an explicit follow filter, fall back to the
                # job's allowed-links whitelist.
                links = filter(lambda link: linkparser.can_follow(
                                   link, job.get_allowed_links()), links)
            for link in links:
                job.add_url(link)

            # Persist any cookies the response set for the next request.
            if resp.cookies:
                cookies = resp.cookies
                if isinstance(resp.cookies, RequestsCookieJar):
                    cookies = requests.utils.dict_from_cookiejar(resp.cookies)
                job.set_cookies(cookies)
        except Exception as e:
            logger.log_error('[%s] Exception caught %s' % (url, e))
            return StrategyResult.PASSED
        return StrategyResult.SUCCESS