def __init__(self, rq=None):
    """
    Create a page getter bound to a request queuer.

    **Keyword arguments:**
     * *rq* -- Request Queuer object. When ``None``, a new
       ``RequestQueuer`` is created. (Default ``None``)
    """
    # Only build a private queuer when the caller did not supply one.
    self.rq = RequestQueuer() if rq is None else rq
def __init__(self, config, pg=None):
    """
    Set up resource mappings, stats, the request queuer and page getter.

    **Arguments:**
     * *config* -- Mapping that must provide ``service_mapping``,
       ``service_args_mapping``, ``max_simultaneous_requests``,
       ``max_requests_per_host_per_second`` and
       ``max_simultaneous_requests_per_host``; ``stats`` is optional.

    **Keyword arguments:**
     * *pg* -- Page getter object. When ``None``, a ``PageGetter``
       sharing this server's request queuer is created.
       (Default ``None``)
    """
    # Resource mappings, plus the per-service inverse of the args mapping.
    self.service_mapping = config["service_mapping"]
    self.service_args_mapping = config["service_args_mapping"]
    self.inverted_args_mapping = dict(
        (service, invert(args_mapping))
        for service, args_mapping in self.service_args_mapping.items())

    # Stats: fall back to the module-level object, then publish the choice
    # back onto the stats module.
    self.stats = config.get('stats', stats.stats)
    stats.stats = self.stats

    # Request queuer configured from the global limits.
    self.rq = RequestQueuer(
        max_simultaneous_requests=config["max_simultaneous_requests"],
        max_requests_per_host_per_second=config[
            "max_requests_per_host_per_second"],
        max_simultaneous_requests_per_host=config[
            "max_simultaneous_requests_per_host"])
    # Per-host overrides for the loopback address.
    # NOTE(review): 0 presumably means "no limit" -- confirm in RequestQueuer.
    self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0)
    self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0)

    self.pg = PageGetter(rq=self.rq) if pg is None else pg
def __init__(self, cassandra_client, redis_client,
             disable_negative_cache=False, time_offset=0, rq=None):
    """
    Create a page getter backed by Cassandra and Redis clients.

    **Arguments:**
     * *cassandra_client* -- Cassandra client object.
     * *redis_client* -- Redis client object.

    **Keyword arguments:**
     * *disable_negative_cache* -- Stored on the instance.
       (Default ``False``)
     * *time_offset* -- Stored on the instance. (Default ``0``)
     * *rq* -- Request Queuer object. When ``None``, a new
       ``RequestQueuer`` is created. (Default ``None``)
    """
    self.cassandra_client = cassandra_client
    self.redis_client = redis_client
    self.disable_negative_cache = disable_negative_cache
    self.time_offset = time_offset
    # Only build a private queuer when the caller did not supply one.
    self.rq = RequestQueuer() if rq is None else rq
def setUp(self):
    """Create the Deferred, web server and request queuer used by the tests."""
    self.deferred = Deferred()
    self.mini_web_server = MiniWebServer()
    # Per-host limits the tests later read back as defaults.
    queuer_limits = {
        "max_requests_per_host_per_second": 3,
        "max_simultaneous_requests_per_host": 5,
    }
    self.rq = RequestQueuer(**queuer_limits)
class RequestQueuerTestCase(unittest.TestCase):
    """Exercise RequestQueuer request dispatch and per-host limit accessors."""

    def setUp(self):
        """Create the web server and a queuer with known per-host limits."""
        self.deferred = Deferred()
        self.mini_web_server = MiniWebServer()
        self.rq = RequestQueuer(
            max_requests_per_host_per_second=3,
            max_simultaneous_requests_per_host=5)

    def tearDown(self):
        """Shut the web server down; its result is returned to the runner."""
        return self.mini_web_server.shutdown()

    def testRequestQueuerOnSuccess(self):
        """A request to the local web server should complete without error."""
        d = self.rq.getPage("http://127.0.0.1:8080/helloworld", timeout=5)
        return d

    def testRequestQueuerOnFailure(self):
        """A failing request must deliver its error through the errback chain."""
        d = self.rq.getPage("http://0.0.0.0:99", timeout=5)
        # The errback swallows the failure so the test result is a success.
        # NOTE(review): this also passes if the request unexpectedly
        # succeeds -- consider asserting the failure explicitly.
        d.addErrback(self._getPageErrback)
        return d

    def testHostMaxRequestsPerSecond(self):
        """Unset hosts report the default; explicit settings are read back."""
        self.assertEqual(
            self.rq.getHostMaxRequestsPerSecond("example.com"), 3)
        self.rq.setHostMaxRequestsPerSecond("example2.com", 7)
        self.assertEqual(
            self.rq.getHostMaxRequestsPerSecond("example2.com"), 7)

    def testHostMaxSimultaneousRequests(self):
        """Unset hosts report the default; explicit settings are read back."""
        self.assertEqual(
            self.rq.getHostMaxSimultaneousRequests("example.com"), 5)
        self.rq.setHostMaxSimultaneousRequests("example2.com", 11)
        self.assertEqual(
            self.rq.getHostMaxSimultaneousRequests("example2.com"), 11)

    def testActive(self):
        """getActive() returns an int."""
        self.assertIsInstance(self.rq.getActive(), int)

    def testPending(self):
        """getPending() returns an int."""
        self.assertIsInstance(self.rq.getPending(), int)

    def testActiveRequestsByHost(self):
        """getActiveRequestsByHost() returns a dict."""
        self.assertIsInstance(self.rq.getActiveRequestsByHost(), dict)

    def testPendingRequestsByHost(self):
        """getPendingRequestsByHost() returns a dict."""
        self.assertIsInstance(self.rq.getPendingRequestsByHost(), dict)

    def _getPageErrback(self, error):
        """Consume *error* and report success to the Deferred chain."""
        return True
class PageGetter(object):
    """
    Fetch pages through a RequestQueuer and flag unchanged content.

    Also provides ``checkNegativeCache`` for consulting Redis-backed
    negative-cache entries for hosts and individual requests.
    """

    def __init__(self, cassandra_client, redis_client,
                 disable_negative_cache=False, time_offset=0, rq=None):
        """
        Create a page getter backed by Cassandra and Redis clients.

        **Arguments:**
         * *cassandra_client* -- Cassandra client object.
         * *redis_client* -- Redis client object, used by
           ``checkNegativeCache``.

        **Keyword arguments:**
         * *disable_negative_cache* -- Stored on the instance.
           (Default ``False``)
         * *time_offset* -- Stored on the instance. (Default ``0``)
         * *rq* -- Request Queuer object. When ``None``, a new
           ``RequestQueuer`` is created. (Default ``None``)
        """
        self.cassandra_client = cassandra_client
        self.redis_client = redis_client
        self.disable_negative_cache = disable_negative_cache
        self.time_offset = time_offset
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq

    @inlineCallbacks
    def getPage(self,
                url,
                method='GET',
                postdata=None,
                headers=None,
                agent="HiiSpider",
                timeout=5,
                cookies=None,
                follow_redirect=1,
                prioritize=False,
                hash_url=None,
                cache=0,
                content_sha1=None,
                confirm_cache_write=False,
                check_only_tld=False,
                disable_negative_cache=False):
        """
        Make an HTTP request through the request queuer.

        The request is always sent straight to the request queuer; the
        negative-cache check and the cache-aware fetch path are not
        invoked here.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request.
           (Default ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``5``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode; must be ``-1``, ``0`` or ``1``. It is
           validated but does not otherwise affect this method.
           (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Accepted; not used by this method.
         * *check_only_tld* -- Reduce the host to its last two
           dot-separated labels.
         * *disable_negative_cache* -- Accepted; not used by this method.

        **Returns:** (via ``returnValue``) the data object produced by the
        request queuer, with a ``"content-sha1"`` entry added when missing.

        **Raises:** ``Exception`` for an unknown cache mode;
        ``StaleContentException`` when the content hash equals
        *content_sha1*.
        """
        request_kwargs = {
            "method": method.upper(),
            "postdata": postdata,
            "headers": headers,
            "agent": agent,
            "timeout": timeout,
            "cookies": cookies,
            "follow_redirect": follow_redirect,
            "prioritize": prioritize}
        cache = int(cache)
        if cache not in (-1, 0, 1):
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # The host is only needed by the negative-cache check, which is not
        # invoked here; the parse is kept so URL handling is unchanged.
        host = _parse(url)[1]
        if check_only_tld:
            # Keep only the last two dot-separated labels of the host.
            host = host.split('.', host.count('.') - 1)[-1]
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_items = [hash_url or url, agent]
        if postdata:
            hash_items.append(repr(postdata))
        if headers and 'Authorization' in headers:
            items = headers['Authorization'].split(',')
            # 'oauth_token' also matches 'oauth_token_secret' items.
            oauth_headers = [
                item for item in items
                if 'oauth_consumer_key' in item or 'oauth_token' in item]
            if oauth_headers:
                hash_items.append(repr(oauth_headers))
        if cookies:
            hash_items.append(repr(cookies))
        request_hash = sha1(json.dumps(hash_items)).hexdigest()
        data = yield self.rq.getPage(url, **request_kwargs)
        # Check for stale contents.
        if "content-sha1" not in data:
            data["content-sha1"] = sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            logger.debug(
                "Raising StaleContentException (4) on %s" % request_hash)
            raise StaleContentException(content_sha1)
        returnValue(data)

    @inlineCallbacks
    def checkNegativeCache(self, negative_cache_host_key,
                           negative_req_cache_key):
        """
        Raise if the host or the request has a live negative-cache entry.

        **Arguments:**
         * *negative_cache_host_key* -- Redis key of the host entry.
         * *negative_req_cache_key* -- Redis key of the request entry.

        **Raises:** ``NegativeHostCacheException`` or
        ``NegativeReqCacheException`` when the matching entry's
        ``'timeout'`` is still in the future. An entry that cannot be
        decoded or read is logged and deleted from Redis instead.
        """
        raw_negative_cache_host = yield self.redis_client.get(
            negative_cache_host_key)
        if raw_negative_cache_host:
            try:
                # NOTE(review): pickle.loads trusts whatever is stored under
                # this key -- confirm only this service writes to it.
                negative_cache_host = pickle.loads(
                    str(decompress(raw_negative_cache_host)))
                if negative_cache_host['timeout'] > time.time():
                    # we get quite a lot of these, ~500/sec on occasions
                    stats.stats.increment('pg.negcache.hit', 0.1)
                    raise NegativeHostCacheException(
                        str(negative_cache_host['error']))
            except NegativeHostCacheException:
                raise
            except Exception as e:
                # Log the key being removed; no request hash exists in
                # this scope.
                logger.error(
                    'Removing host %s from the negative cache: %s' % (
                        negative_cache_host_key, e))
                stats.stats.increment('pg.negcache.flush')
                self.redis_client.delete(negative_cache_host_key)
        raw_negative_req_cache_item = yield self.redis_client.get(
            negative_req_cache_key)
        if raw_negative_req_cache_item:
            try:
                # NOTE(review): same pickle trust assumption as above.
                negative_req_cache_item = pickle.loads(
                    str(decompress(raw_negative_req_cache_item)))
                if negative_req_cache_item['timeout'] > time.time():
                    stats.stats.increment('pg.negreqcache.hit')
                    raise NegativeReqCacheException(
                        str(negative_req_cache_item['error']))
            except NegativeReqCacheException:
                # Let a genuine negative-cache hit reach the caller rather
                # than being treated as a corrupt entry below.
                raise
            except Exception as e:
                logger.error(
                    'Removing item %s from the negative cache: %s' % (
                        negative_req_cache_key, e))
                stats.stats.increment('pg.negreqcache.flush', 0.5)
                self.redis_client.delete(negative_req_cache_key)
class PageGetter(object):
    """Fetch pages through a RequestQueuer and flag unchanged content."""

    def __init__(self, cassandra_client, redis_client,
                 disable_negative_cache=False, time_offset=0, rq=None):
        """
        Create a page getter backed by Cassandra and Redis clients.

        **Arguments:**
         * *cassandra_client* -- Cassandra client object.
         * *redis_client* -- Redis client object.

        **Keyword arguments:**
         * *disable_negative_cache* -- Stored on the instance.
           (Default ``False``)
         * *time_offset* -- Stored on the instance. (Default ``0``)
         * *rq* -- Request Queuer object. When ``None``, a new
           ``RequestQueuer`` is created. (Default ``None``)
        """
        self.cassandra_client = cassandra_client
        self.redis_client = redis_client
        self.disable_negative_cache = disable_negative_cache
        self.time_offset = time_offset
        self.rq = RequestQueuer() if rq is None else rq

    @inlineCallbacks
    def getPage(self,
                url,
                method='GET',
                postdata=None,
                headers=None,
                agent="HiiSpider",
                timeout=5,
                cookies=None,
                follow_redirect=1,
                prioritize=False,
                hash_url=None,
                cache=0,
                content_sha1=None,
                confirm_cache_write=False,
                check_only_tld=False,
                disable_negative_cache=False):
        """
        Make an HTTP request through the request queuer.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request.
           (Default ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``5``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode; must be ``-1``, ``0`` or ``1``.
           (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Accepted; not read by this method.
         * *check_only_tld* -- Reduce the host to its last two
           dot-separated labels.
         * *disable_negative_cache* -- Accepted; not read by this method.

        **Returns:** (via ``returnValue``) the data object produced by the
        request queuer, with a ``"content-sha1"`` entry added when missing.
        """
        stats.stats.increment("pg.getpage", 0.05)
        start = time.time()
        request_kwargs = dict(
            method=method.upper(),
            postdata=postdata,
            headers=headers,
            agent=agent,
            timeout=timeout,
            cookies=cookies,
            follow_redirect=follow_redirect,
            prioritize=prioritize)

        cache = int(cache)
        if cache not in (-1, 0, 1):
            raise Exception("Unknown caching mode.")

        if not isinstance(url, str):
            url = convertToUTF8(url)
        if not (hash_url is None or isinstance(hash_url, str)):
            hash_url = convertToUTF8(hash_url)

        # Host portion of the URL, optionally trimmed to its last two
        # dot-separated labels.
        host = _parse(url)[1]
        if check_only_tld:
            host_parts = host.split('.', host.count('.') - 1)
            host = host_parts[-1]

        # Build the request hash from the URL (or caller-supplied hash_url)
        # plus whatever else distinguishes the request.
        hash_items = [hash_url or url, agent]
        if postdata:
            hash_items.append(repr(postdata))
        if headers and 'Authorization' in headers:
            oauth_markers = (
                'oauth_consumer_key', 'oauth_token', 'oauth_token_secret')
            oauth_headers = []
            for item in headers['Authorization'].split(','):
                if any(marker in item for marker in oauth_markers):
                    oauth_headers.append(item)
            if oauth_headers:
                hash_items.append(repr(oauth_headers))
        if cookies:
            hash_items.append(repr(cookies))
        request_hash = sha1(json.dumps(hash_items)).hexdigest()

        data = yield self.rq.getPage(url, **request_kwargs)
        if "content-sha1" not in data:
            data["content-sha1"] = sha1(data["response"]).hexdigest()
        # Identical content hash means the caller already has this content.
        if data["content-sha1"] == content_sha1:
            stats.stats.increment('pg.stalecontent')
            raise StaleContentException(content_sha1)
        returnValue(data)
class BaseServer(object): exposed_functions = [] exposed_function_resources = {} logging_handler = None shutdown_trigger_id = None uuid = uuid4().hex start_time = time.time() active_jobs = {} reserved_arguments = [ "reservation_function_name", "reservation_created", "reservation_next_request", "reservation_error"] functions = {} delta_functions = {} categories = {} fast_cache = {} function_resource = None def __init__(self, config, pg=None): # Resource Mappings self.service_mapping = config["service_mapping"] self.service_args_mapping = config["service_args_mapping"] self.inverted_args_mapping = dict([(s[0], invert(s[1])) for s in self.service_args_mapping.items()]) # configure stats self.stats = config.get('stats', stats.stats) stats.stats = self.stats # Request Queuer self.rq = RequestQueuer( max_simultaneous_requests=config["max_simultaneous_requests"], max_requests_per_host_per_second=config["max_requests_per_host_per_second"], max_simultaneous_requests_per_host=config["max_simultaneous_requests_per_host"]) self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0) self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0) if pg is None: self.pg = PageGetter(rq=self.rq) else: self.pg = pg def start(self): start_deferred = Deferred() reactor.callWhenRunning(self._baseStart, start_deferred) return start_deferred def _baseStart(self, start_deferred): logger.debug("Starting Base components.") self.shutdown_trigger_id = reactor.addSystemEventTrigger( 'before', 'shutdown', self.shutdown) start_deferred.callback(True) @inlineCallbacks def shutdown(self): while self.rq.getPending() > 0 or self.rq.getActive() > 0: logger.debug("%s requests active, %s requests pending." % ( self.rq.getPending(), self.rq.getActive() )) shutdown_deferred = Deferred() # Call the Deferred after a second to continue the loop. 
reactor.callLater(1, shutdown_deferred.callback) yield shutdown_deferred self.shutdown_trigger_id = None logger.critical("Server shut down.") logger.removeHandler(self.logging_handler) returnValue(True) def getManholeFactory(self, namespace, **passwords): realm = manhole_ssh.TerminalRealm() def getManhole(_): return manhole.Manhole(namespace) realm.chainedProtocolFactory.protocolFactory = getManhole p = portal.Portal(realm) p.registerChecker( checkers.InMemoryUsernamePasswordDatabaseDontUse(**passwords)) f = manhole_ssh.ConchFactory(p) return f def delta(self, func, handler): self.delta_functions[id(func)] = handler def expose(self, *args, **kwargs): return self.makeCallable(expose=True, *args, **kwargs) @inlineCallbacks def executeJob(self, job): dotted_function = '.'.join(job.function_name.split('/')) timer = 'job.%s.duration' % (dotted_function) self.stats.timer.start(timer, 0.5) self.stats.timer.start('job.time', 0.1) if not job.mapped: job = self.mapJob(job) f = self.functions[job.function_name] if job.uuid is not None: self.active_jobs[job.uuid] = True if f["get_job_uuid"]: job.kwargs["job_uuid"] = job.uuid if f["check_fast_cache"]: job.kwargs["fast_cache"] = job.fast_cache try: data = yield self.executeFunction(job.function_name, **job.kwargs) except NegativeCacheException: self.stats.timer.stop(timer) self.stats.timer.stop('job.time') raise except QueueTimeoutException: self.stats.timer.stop(timer) self.stats.timer.stop('job.time') raise except Exception, e: self.stats.increment('job.%s.failure' % dotted_function) self.stats.timer.stop(timer) self.stats.timer.stop('job.time') raise finally:
class PageGetter:
    """
    Fetch pages through a RequestQueuer, remembering failing hosts in an
    in-memory negative cache.
    """

    # Maps host -> {'timeout': epoch seconds, 'retries': count,
    # 'error': the error object last passed to the errback}.
    # Defined on the class, so the mapping is shared by every instance.
    negitive_cache = {}

    def __init__(self, rq=None):
        """
        Create a page getter bound to a request queuer.

        **Keyword arguments:**
         * *rq* -- Request Queuer object. When ``None``, a new
           ``RequestQueuer`` is created. (Default ``None``)
        """
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq

    def getPage(self,
                url,
                method='GET',
                postdata=None,
                headers=None,
                agent="HiiSpider",
                timeout=60,
                cookies=None,
                follow_redirect=1,
                prioritize=False,
                hash_url=None,
                cache=0,
                content_sha1=None,
                confirm_cache_write=False,
                check_only_tld=False,
                disable_negative_cache=False,
                ):
        """
        Make an HTTP request through the request queuer.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request.
           (Default ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``60``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Accepted for interface compatibility. It is coerced
           with ``int()`` but otherwise ignored by this implementation.
           (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Accepted; not used by this method.
         * *check_only_tld* -- Reduce the host to its last two
           dot-separated labels before the negative cache lookup.
           (Default ``False``)
         * *disable_negative_cache* -- Skip the negative cache lookup for
           this request. (Default ``False``)

        **Returns:** the Deferred from the request queuer, with the stale
        content check and the negative cache bookkeeping attached.

        **Raises:** the last recorded error for the host, synchronously,
        when the host has an unexpired negative cache entry.
        """
        request_kwargs = {
            "method": method.upper(),
            "postdata": postdata,
            "headers": headers,
            "agent": agent,
            "timeout": timeout,
            "cookies": cookies,
            "follow_redirect": follow_redirect,
            "prioritize": prioritize}
        # Kept only so a non-numeric value still fails here; the mode itself
        # was always overridden to 0 and is never consulted.
        cache = int(cache)
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        host = _parse(url)[1]
        if check_only_tld:
            # Keep only the last two dot-separated labels of the host.
            host = host.split('.', host.count('.') - 1)[-1]
        # Check negative cache, unless the caller opted out for this request.
        if not disable_negative_cache and host in self.negitive_cache:
            entry = self.negitive_cache[host]
            if entry['timeout'] >= time.time():
                logger.error('Found %s in negitive cache, raising last known exception' % host)
                return entry['error'].raiseException()
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_source = url if hash_url is None else hash_url
        request_hash = hashlib.sha1(
            json.dumps([hash_source, agent])).hexdigest()
        d = self.rq.getPage(url, **request_kwargs)
        # addCallbacks pairs the two handlers at the same level, so a
        # StaleContentException raised by the success callback is NOT fed
        # to the errback and cannot negative-cache a host that answered.
        d.addCallbacks(
            self._checkForStaleContent,
            self._getPageErrback,
            callbackArgs=(content_sha1, request_hash, host),
            errbackArgs=(host,))
        return d

    def _checkForStaleContent(self, data, content_sha1, request_hash, host):
        """
        Clear *host* from the negative cache and reject unchanged content.

        Returns *data* (with ``"content-sha1"`` filled in when missing) or
        raises ``StaleContentException`` when the hash equals
        *content_sha1*.
        """
        # A successful response means the host is reachable again.
        if host in self.negitive_cache:
            logger.error('Removing %s from negitive cache' % host)
            del self.negitive_cache[host]
        if "content-sha1" not in data:
            data["content-sha1"] = hashlib.sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            logger.debug("Raising StaleContentException (4) on %s" % request_hash)
            raise StaleContentException(content_sha1)
        return data

    def _getPageErrback(self, error, host):
        """
        Record a failed request for *host* and re-raise the error.

        Errors whose status is 500 or above, or that carry no usable
        status, add or extend the host's negative cache entry.
        """
        try:
            status = int(error.value.status)
        except (AttributeError, TypeError, ValueError):
            # No usable HTTP status on the error: treat as a server failure.
            status = 500
        if status >= 500:
            if host not in self.negitive_cache:
                logger.error('Adding %s to negitive cache' % host)
                self.negitive_cache[host] = {
                    'timeout': time.time() + 300,
                    'retries': 1,
                    'error': error
                }
            else:
                entry = self.negitive_cache[host]
                # Back off for 10 minutes up to the fifth recorded retry,
                # then for an hour.
                if entry['retries'] <= 5:
                    entry['timeout'] = time.time() + 600
                else:
                    entry['timeout'] = time.time() + 3600
                entry['retries'] += 1
                entry['error'] = error
                logger.error('Updating negitive cache for host %s which has failed %d times' % (host, entry['retries']))
        error.raiseException()