def test_can_enqueue_url_and_fetch(self, fetch_mock):
    """With caching enabled, enqueueing should fetch immediately and drain the queue."""
    tornado_otto = TornadoOctopus(cache=True)

    tornado_otto.enqueue('http://www.google.com', None, method='GET', something="else")

    expect(tornado_otto.url_queue).to_be_empty()
    fetch_mock.assert_called_once_with(
        'http://www.google.com', None, 'GET', something='else')
def test_can_enqueue_url(self):
    """With caching off and zero concurrency the URL should just sit in the queue."""
    tornado_otto = TornadoOctopus(cache=False, concurrency=0)

    tornado_otto.enqueue('http://www.google.com', None, method='GET', something="else")

    expect(tornado_otto.url_queue).to_length(1)
def test_can_enqueue_and_get_when_cache_miss(self):
    """A cache miss should trigger a real fetch and invoke the callback."""
    tornado_otto = TornadoOctopus(cache=True, auto_start=True)

    def on_response(url, response):
        self.url = url
        self.response = response

    tornado_otto.enqueue('http://www.google.com', on_response, method='GET')
    tornado_otto.wait(2)

    expect(tornado_otto.url_queue).to_be_empty()
    expect(self.response).not_to_be_null()
def test_can_handle_exception(self, log_mock):
    """Exceptions raised by a callback must be swallowed and logged, not propagated."""
    target = 'http://www.globo.com'
    tornado_otto = TornadoOctopus(concurrency=4, auto_start=True)

    def failing_callback(url, response):
        raise RuntimeError(url)

    tornado_otto.enqueue(target, failing_callback)
    tornado_otto.wait(2)

    log_mock.assert_called_once_with(
        'Error calling callback for http://www.globo.com.')
def test_can_enqueue_and_get_from_cache(self):
    """A cached response must be served synchronously, bypassing the queue."""
    mock_response = Mock()

    tornado_otto = TornadoOctopus(cache=True)
    tornado_otto.response_cache.put('http://www.google.com', mock_response)

    def on_response(url, response):
        self.url = url
        self.response = response

    tornado_otto.enqueue('http://www.google.com', on_response, method='GET')

    expect(tornado_otto.url_queue).to_be_empty()
    expect(self.response).not_to_be_null()
    expect(self.response).to_equal(mock_response)
def test_can_handle_timeouts(self):
    """Requests exceeding the timeout must surface a 599 error response."""
    slow_url = 'http://baidu.com'
    tornado_otto = TornadoOctopus(
        concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)

    def on_response(url, response):
        self.response = response

    tornado_otto.enqueue(slow_url, on_response)
    tornado_otto.wait(5)

    expect(self.response.status_code).to_equal(599)
    expect(self.response.text).to_be_null()
    expect(self.response.error).not_to_be_null()
def test_can_handle_exception(self, log_mock):
    """A callback that raises should be logged rather than crash the loop."""
    otto_client = TornadoOctopus(concurrency=4, auto_start=True)
    page_url = 'http://www.globo.com'

    def broken_handler(url, response):
        raise RuntimeError(url)

    otto_client.enqueue(page_url, broken_handler)
    otto_client.wait(2)

    expected_message = 'Error calling callback for http://www.globo.com.'
    log_mock.assert_called_once_with(expected_message)
def test_can_handle_invalid_urls(self):
    """Unresolvable hosts must yield a 599 response carrying the error."""
    bogus_url = 'http://kagdjdkjgka.fk'
    otto_client = TornadoOctopus(concurrency=1, auto_start=True)

    def on_response(url, response):
        self.response = response

    otto_client.enqueue(bogus_url, on_response)
    otto_client.wait(5)

    expect(self.response).not_to_be_null()
    expect(self.response.status_code).to_equal(599)
    expect(self.response.text).to_be_null()
    expect(self.response.error).not_to_be_null()
def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False):
    """Benchmark retrieving `urls_to_retrieve` concurrently with TornadoOctopus.

    :param repetitions: kept for signature parity with the other benchmark
        runners; not used by this function.
    :param concurrency: number of simultaneous requests.
    :param urls_to_retrieve: iterable of URLs to fetch.
    :param ignore_pycurl: when True, force Tornado's SimpleHTTPClient
        instead of the pycurl-backed client.
    :returns: elapsed wall-clock seconds spent waiting for all requests.
    """
    # Conditional expression instead of the fragile `cond and a or b` idiom.
    client_label = "using SimpleHTTPClient" if ignore_pycurl else "using pycurl"
    message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % client_label

    # `print("")` (not a bare Python-2 `print` statement) so the blank
    # lines behave identically under Python 2 and Python 3.
    print("")
    print("=" * len(message))
    print(message)
    print("=" * len(message))
    print("")

    otto = TornadoOctopus(
        concurrency=concurrency, cache=False,
        auto_start=True, ignore_pycurl=ignore_pycurl)

    for url in urls_to_retrieve:
        otto.enqueue(url, handle_url_response)

    # Time only the drain of the queue; wait(0) blocks until all done.
    start_time = time()
    otto.wait(0)
    return time() - start_time
def get_avatars(urls):
    """Fetch avatar bodies for every URL in `urls` concurrently.

    :param urls: iterable of avatar URLs.
    :returns: list of response bodies; URLs answering 'Not found' are
        reported on stdout and skipped.
    """
    avatars = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if 'Not found' == response.text:
            # Parenthesized print: the original `print '...' % url`
            # statement is a SyntaxError on Python 3; this form behaves
            # identically on Python 2 and 3.
            print('URL Not Found: %s' % url)
        else:
            avatars.append(response.text)

    for url in urls:
        otto.enqueue(url, handle_url_response)

    # Block until every enqueued URL has been handled.
    otto.wait()

    return avatars
def test_can_get_many_urls(self):
    """All enqueued URLs should complete and report HTTP 200."""
    target_urls = [
        'http://www.globo.com',
        'http://www.twitter.com',
        'http://www.facebook.com'
    ]
    otto_client = TornadoOctopus(concurrency=4, auto_start=True)

    def on_response(url, response):
        self.responses[url] = response

    for target in target_urls:
        otto_client.enqueue(target, on_response)

    otto_client.wait(2)

    expect(self.responses).to_length(3)

    for target in target_urls:
        expect(self.responses).to_include(target)
        expect(self.responses[target].status_code).to_equal(200)
def _download_url_list(image_url_list):
    '''Downloads the image sources of images listed on `image_url_list`.

    :param image_url_list: iterable of image URLs to download.
    :returns: list of response bodies; URLs answering 'Not found' are
        printed and skipped.
    '''
    images = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True, expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if 'Not found' == response.text:
            # Parenthesized print: the original `print url` statement is
            # a SyntaxError on Python 3; this form works on both 2 and 3.
            print(url)
        else:
            images.append(response.text)

    for url in image_url_list:
        otto.enqueue(url, handle_url_response)

    # wait(0) blocks until the queue is fully drained.
    otto.wait(0)

    return images
def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
    """The per-domain limiter must track each configured domain separately."""
    domain_limiter = PerDomainInMemoryLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1}
    )
    otto_client = TornadoOctopus(
        concurrency=10, auto_start=True, limiter=domain_limiter)

    targets = [
        'http://globoesporte.globo.com',
        'http://globoesporte.globo.com/futebol/times/flamengo/',
        'http://g1.globo.com',
        'http://g1.globo.com/economia',
    ]
    for target in targets:
        otto_client.enqueue(target, self.handle_url_response)

    otto_client.wait(2)

    expect(self.responses).to_length(4)
    expect(list(domain_limiter.domain_count.keys())).to_be_like(
        ['http://g1.globo.com', 'http://globoesporte.globo.com'])
def test_should_not_get_more_than_one_url_for_same_domain_concurrently(
        self):
    """Concurrent fetches to the same domain are limited to one at a time."""
    limiter = PerDomainInMemoryLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1})

    octopus = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

    octopus.enqueue('http://globoesporte.globo.com', self.handle_url_response)
    octopus.enqueue(
        'http://globoesporte.globo.com/futebol/times/flamengo/',
        self.handle_url_response)
    octopus.enqueue('http://g1.globo.com', self.handle_url_response)
    octopus.enqueue('http://g1.globo.com/economia', self.handle_url_response)

    octopus.wait(2)

    expect(self.responses).to_length(4)

    expected_domains = [
        'http://g1.globo.com', 'http://globoesporte.globo.com']
    expect(list(limiter.domain_count.keys())).to_be_like(expected_domains)
def test_should_call_limiter_miss_twice(self):
    """Two domains at capacity 1 with two URLs each yield exactly two lock misses."""
    redis_limiter = PerDomainRedisLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1},
        redis=self.redis)
    redis_limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

    octopus = TornadoOctopus(
        concurrency=10, auto_start=True, limiter=redis_limiter)

    for target in (
            'http://globoesporte.globo.com/',
            'http://globoesporte.globo.com/futebol/times/flamengo/',
            'http://g1.globo.com/',
            'http://g1.globo.com/economia/'):
        octopus.enqueue(target, self.handle_url_response)

    octopus.wait()

    expect(self.cache_miss).to_length(2)
def test_should_call_limiter_miss_twice(self):
    """Lock misses on saturated domains must be reported via the subscription."""
    limiter = PerDomainRedisLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1},
        redis=self.redis
    )
    limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

    otto_client = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

    otto_client.enqueue('http://globoesporte.globo.com/', self.handle_url_response)
    otto_client.enqueue(
        'http://globoesporte.globo.com/futebol/times/flamengo/',
        self.handle_url_response)
    otto_client.enqueue('http://g1.globo.com/', self.handle_url_response)
    otto_client.enqueue('http://g1.globo.com/economia/', self.handle_url_response)

    otto_client.wait()

    expect(self.cache_miss).to_length(2)
class BaseWorker(BaseCLI):
    """Base class for holmes worker CLIs.

    Wires up the configured validators/facters, the TornadoOctopus HTTP
    client with an optional per-domain limiter, the request cache, error
    reporting and key persistence.
    """

    def _load_validators(self):
        # Validator classes are configured as dotted paths.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes are configured as dotted paths.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build the per-domain request limiter.

        Returns None when no domain limits are configured in the cache.
        """
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
            )
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Refresh the limiter's domain definitions from the cache."""
        domains = self.cache.get_domain_limiters()

        # BUGFIX: get_otto_limiter() returns None when no domains are
        # configured, so `self.otto.limiter` can exist and be None —
        # guard against calling update_domain_definitions on None.
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create and start the TornadoOctopus client used for all fetches."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency,
            cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Forward an exception to every configured error handler."""
        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url` asynchronously, serving from the request cache
        when a cached response exists."""
        url, response = self.cache.get_request(url)

        if not response:
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            # Cache hit: invoke the handler synchronously.
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap `handler` so every response is stored in the request
        cache before being handed to the caller."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers,
                response.cookies, response.text, response.effective_url,
                response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        # Subclass hook invoked when a domain lock cannot be acquired.
        pass

    def publish(self, data):
        """Publish `data` on the 'events' redis pub/sub channel."""
        self.redis_pub_sub.publish('events', data)

    def _insert_keys(self, keys):
        """Persist every key name in `keys`, attaching the Key model
        instance back onto each entry under the 'key' slot."""
        from holmes.models import Key

        for name in keys.keys():
            # One subtransaction per key so a failure affects only that key.
            self.db.begin(subtransactions=True)
            key = Key.get_or_create(self.db, name)
            keys[name]['key'] = key
            self.db.add(key)
            self.db.commit()
class BaseWorker(BaseCLI):
    """Base class for holmes worker CLIs.

    Wires up the configured validators/facters, the TornadoOctopus HTTP
    client with an optional per-domain limiter, the request cache,
    database-session recovery and error reporting.
    """

    def _load_validators(self):
        # Validator classes are configured as dotted paths.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes are configured as dotted paths.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Build the per-domain request limiter.

        Returns None when no domain limits are configured in the cache.
        """
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION
            )
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Refresh the limiter's domain definitions from the cache."""
        domains = self.cache.get_domain_limiters()

        # get_otto_limiter() may have returned None, so check both that
        # the attribute exists and that it is an actual limiter.
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Create and start the TornadoOctopus client used for all fetches."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency,
            cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter()
        )
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Recover from a worker exception.

        Rolls back and rebuilds the DB session, drops pending fetches,
        then notifies every configured error handler.
        """
        try:
            # Only roll back when the connection is still usable.
            if not self.db.connection().invalidated:
                self.db.rollback()
        except Exception:
            err = sys.exc_info()[1]
            logging.error("Cannot rollback: %s" % str(err))

        # Drop queued URLs and rebuild a fresh scoped session so the
        # worker restarts from a clean slate.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for handler in self.error_handlers:
            handler.handle_exception(
                exc_type, exc_value, tb, extra={
                    'worker-uuid': self.uuid,
                    'holmes-version': __version__
                }
            )

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url` asynchronously, serving from the request cache
        when a cached response exists."""
        url, response = self.cache.get_request(url)

        # All outgoing requests identify themselves as holmes.
        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if not response:
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            # Cache hit: invoke the handler synchronously.
            handler(url, response)

    def handle_response(self, url, handler):
        """Wrap `handler` so every response is stored in the request
        cache before being handed to the caller."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers,
                response.cookies, response.text, response.effective_url,
                response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS
            )
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        # Subclass hook invoked when a domain lock cannot be acquired.
        pass

    def publish(self, data):
        """Publish `data` on the 'events' redis pub/sub channel."""
        self.redis_pub_sub.publish('events', data)
class BaseWorker(BaseCLI):
    """Base class for holmes worker command-line tools.

    Provides validator/facter loading, the shared TornadoOctopus HTTP
    client with optional per-domain limiting, request caching, database
    session recovery and error-handler fan-out.
    """

    def _load_validators(self):
        # Validator classes come from the config as dotted paths.
        return load_classes(default=self.config.VALIDATORS)

    def _load_facters(self):
        # Facter classes come from the config as dotted paths.
        return load_classes(default=self.config.FACTERS)

    def get_otto_limiter(self):
        """Return a configured per-domain Limiter, or None when the
        cache reports no domain limits."""
        domains = self.cache.get_domain_limiters()
        limiter = None

        if domains:
            limiter = Limiter(
                *domains,
                redis=self.redis,
                expiration_in_seconds=self.config.LIMITER_LOCKS_EXPIRATION)
            limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

        return limiter

    def update_otto_limiter(self):
        """Push the latest domain limits from the cache into the limiter."""
        domains = self.cache.get_domain_limiters()

        # Guard: the limiter attribute can be absent or explicitly None.
        if hasattr(self.otto, 'limiter') and self.otto.limiter is not None:
            self.otto.limiter.update_domain_definitions(*domains)

    def start_otto(self):
        """Instantiate and start the shared TornadoOctopus client."""
        self.info('Starting Octopus with %d concurrent threads.' % self.options.concurrency)
        self.otto = TornadoOctopus(
            concurrency=self.options.concurrency,
            cache=self.options.cache,
            connect_timeout_in_seconds=self.config.CONNECT_TIMEOUT_IN_SECONDS,
            request_timeout_in_seconds=self.config.REQUEST_TIMEOUT_IN_SECONDS,
            limiter=self.get_otto_limiter())
        self.otto.start()

    def handle_error(self, exc_type, exc_value, tb):
        """Recover the worker after an exception: roll back the DB,
        rebuild the session, clear pending fetches and notify handlers."""
        try:
            # Roll back only if the connection was not invalidated.
            if not self.db.connection().invalidated:
                self.db.rollback()
        except Exception:
            err = sys.exc_info()[1]
            logging.error("Cannot rollback: %s" % str(err))

        # Start over with an empty fetch queue and a fresh session.
        self.otto.url_queue = []
        self.db.close_all()
        self.db.remove()
        self.db = scoped_session(self.sqlalchemy_db_maker)

        for handler in self.error_handlers:
            handler.handle_exception(exc_type, exc_value, tb, extra={
                'worker-uuid': self.uuid,
                'holmes-version': __version__
            })

    def async_get(self, url, handler, method='GET', **kw):
        """Fetch `url`, preferring the request cache; on a miss the
        request is enqueued on the Octopus client."""
        url, response = self.cache.get_request(url)

        # Every outgoing request carries the holmes user agent.
        kw['user_agent'] = self.config.HOLMES_USER_AGENT

        if not response:
            kw['proxy_host'] = self.config.HTTP_PROXY_HOST
            kw['proxy_port'] = self.config.HTTP_PROXY_PORT

            self.debug('Enqueueing %s for %s...' % (method, url))
            self.otto.enqueue(url, self.handle_response(url, handler), method, **kw)
        else:
            # Cached response: call back immediately.
            handler(url, response)

    def handle_response(self, url, handler):
        """Return a callback that caches the response before delegating
        to `handler`."""
        def handle(url, response):
            self.cache.set_request(
                url, response.status_code, response.headers,
                response.cookies, response.text, response.effective_url,
                response.error, response.request_time,
                self.config.REQUEST_CACHE_EXPIRATION_IN_SECONDS)
            handler(url, response)
        return handle

    def handle_limiter_miss(self, url):
        # No-op hook; subclasses may react to domain-lock misses.
        pass

    def publish(self, data):
        """Send `data` out on the 'events' redis pub/sub channel."""
        self.redis_pub_sub.publish('events', data)