def test_can_wait_when_no_urls(self, logging_mock):
    """wait() on an octopus with an empty queue must return immediately and log that fact."""
    otto = TornadoOctopus(cache=False, auto_start=True)

    otto.wait()

    # `assert_calls` is not a Mock assertion method: modern unittest.mock
    # raises AttributeError for unknown `assert_*` attributes, and older
    # versions silently created a child mock and asserted nothing.
    logging_mock.assert_any_call(
        'No urls to wait for. Returning immediately.')
def download():
    """Fetch NUM_IMAGES lorempixel images for every (size, width, height) in SIZES.

    Failed downloads are re-enqueued until they succeed; successful bodies are
    written under ./tests/fixtures/imageset/<size>/image_<index>.jpg.
    """
    otto = TornadoOctopus(
        concurrency=100, auto_start=True,
    )

    def enqueue(size, index, width, height):
        # Use the `index` parameter, NOT the enclosing loop variable `i`:
        # the retry path in `handle` runs long after the loop below has
        # finished, when `i` is frozen at its final value, so closing over
        # `i` re-enqueued the wrong image number.
        url = 'http://lorempixel.com/%d/%d/' % (width, height)
        print("Enqueuing %s image %d..." % (size, index))
        otto.enqueue(url, handle_url_response(size, index, width, height))

    def handle_url_response(size, index, width, height):
        def handle(url, response):
            if response.status_code != 200:
                print("%s image %d (%d) failed." % (size, index, response.status_code))
                # best-effort retry: put the exact same image back on the queue
                enqueue(size, index, width, height)
                return

            print("%s image %d (%d) saved." % (size, index, response.status_code))

            path = './tests/fixtures/imageset/%s' % size
            if not exists(path):
                os.makedirs(path)

            jpg = StringIO(response.text)
            img = Image.open(jpg)
            img.save('%s/image_%d.jpg' % (path, index))

        return handle

    for i in range(NUM_IMAGES):
        for size, width, height in SIZES:
            enqueue(size, i + 1, width, height)

    otto.wait()  # waits until queue is empty or timeout is ellapsed
def test_can_wait_when_urls_and_timeout(self):
    """wait() with urls in flight should arm the ioloop blocking-signal threshold."""
    otto = TornadoOctopus(cache=False, auto_start=True)
    otto.ioloop = Mock()
    otto.running_urls = 10

    otto.wait()

    # A bare expect(...) only builds the expectation object and never checks
    # anything, so the test could not fail; chain a matcher to actually assert.
    expect(otto.ioloop.set_blocking_signal_threshold.called).to_be_true()
def test_can_wait_when_urls_and_no_timeout(self, logging_mock):
    """wait(0) with urls still running must block on the ioloop and log that it is waiting."""
    otto = TornadoOctopus(cache=False, auto_start=True)
    otto.ioloop = Mock()
    otto.running_urls = 10

    otto.wait(0)

    # `assert_calls` is not a Mock assertion method (AttributeError on modern
    # unittest.mock, silent no-op on old versions) — use the real helper.
    logging_mock.assert_any_call('Waiting for urls to be retrieved.')
def test_can_enqueue_and_get_when_cache_miss(self):
    """A cache-enabled octopus must fetch an uncached url and drain its queue."""
    otto = TornadoOctopus(cache=True, auto_start=True)

    def on_response(url, response):
        # Record the result so the assertions below can inspect it.
        # (Also avoids shadowing the callback's own name, unlike the
        # original `response` callback.)
        self.url = url
        self.response = response

    otto.enqueue('http://www.google.com', on_response, method='GET')
    otto.wait(2)

    expect(otto.url_queue).to_be_empty()
    expect(self.response).not_to_be_null()
def test_can_handle_exception(self, log_mock):
    """An exception raised inside a url callback is swallowed and logged, not propagated."""
    target = 'http://www.globo.com'
    otto = TornadoOctopus(concurrency=4, auto_start=True)

    def exploding_callback(url, response):
        raise RuntimeError(url)

    otto.enqueue(target, exploding_callback)
    otto.wait(2)

    log_mock.assert_called_once_with('Error calling callback for http://www.globo.com.')
def test_can_handle_timeouts(self):
    """A request exceeding request_timeout_in_seconds surfaces a 599 error response."""
    target = 'http://baidu.com'
    otto = TornadoOctopus(concurrency=1, request_timeout_in_seconds=0.1, auto_start=True)

    def capture(url, response):
        self.response = response

    otto.enqueue(target, capture)
    otto.wait(5)

    # Tornado reports client-side timeouts as status 599 with no body.
    expect(self.response.status_code).to_equal(599)
    expect(self.response.text).to_be_null()
    expect(self.response.error).not_to_be_null()
def test_can_handle_exception(self, log_mock):
    """Callback errors must be caught by the octopus and reported via the error log."""
    otto = TornadoOctopus(concurrency=4, auto_start=True)

    def handle_url_response(url, response):
        raise RuntimeError(url)

    otto.enqueue('http://www.globo.com', handle_url_response)
    otto.wait(2)

    expected = 'Error calling callback for http://www.globo.com.'
    log_mock.assert_called_once_with(expected)
def test_can_handle_invalid_urls(self):
    """An unresolvable host must yield an error response (599, no body) instead of raising."""
    otto = TornadoOctopus(concurrency=1, auto_start=True)

    def capture(url, response):
        self.response = response

    otto.enqueue('http://kagdjdkjgka.fk', capture)
    otto.wait(5)

    expect(self.response).not_to_be_null()
    expect(self.response.status_code).to_equal(599)
    expect(self.response.text).to_be_null()
    expect(self.response.error).not_to_be_null()
def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self):
    """The in-memory limiter must serve every url while tracking each configured domain."""
    limiter = PerDomainInMemoryLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1}
    )
    otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

    for url in (
        'http://globoesporte.globo.com',
        'http://globoesporte.globo.com/futebol/times/flamengo/',
        'http://g1.globo.com',
        'http://g1.globo.com/economia',
    ):
        otto.enqueue(url, self.handle_url_response)

    otto.wait(2)

    expect(self.responses).to_length(4)
    expect(list(limiter.domain_count.keys())).to_be_like(
        ['http://g1.globo.com', 'http://globoesporte.globo.com'])
def test_should_call_limiter_miss_twice(self):
    """With one slot per domain, four urls across two domains miss the lock exactly twice."""
    limiter = PerDomainRedisLimiter(
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1},
        redis=self.redis
    )
    limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

    otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)
    for url in (
        'http://globoesporte.globo.com/',
        'http://globoesporte.globo.com/futebol/times/flamengo/',
        'http://g1.globo.com/',
        'http://g1.globo.com/economia/',
    ):
        otto.enqueue(url, self.handle_url_response)

    otto.wait()

    expect(self.cache_miss).to_length(2)
def test_should_call_limiter_miss_twice(self):
    """Second url of each single-slot domain should register a lock miss (two in total)."""
    per_domain_limits = (
        {'http://g1.globo.com': 1},
        {'http://globoesporte.globo.com': 1},
    )
    limiter = PerDomainRedisLimiter(*per_domain_limits, redis=self.redis)
    limiter.subscribe_to_lock_miss(self.handle_limiter_miss)

    otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)
    otto.enqueue('http://globoesporte.globo.com/', self.handle_url_response)
    otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response)
    otto.enqueue('http://g1.globo.com/', self.handle_url_response)
    otto.enqueue('http://g1.globo.com/economia/', self.handle_url_response)
    otto.wait()

    expect(self.cache_miss).to_length(2)
def test_should_not_get_more_than_one_url_for_same_domain_concurrently(
        self):
    """All four urls must complete and the limiter must have counted both domains."""
    limiter = PerDomainInMemoryLimiter(
        {'http://g1.globo.com': 1}, {'http://globoesporte.globo.com': 1})
    otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter)

    otto.enqueue('http://globoesporte.globo.com', self.handle_url_response)
    otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/',
                 self.handle_url_response)
    otto.enqueue('http://g1.globo.com', self.handle_url_response)
    otto.enqueue('http://g1.globo.com/economia', self.handle_url_response)

    otto.wait(2)

    expect(self.responses).to_length(4)
    tracked_domains = list(limiter.domain_count.keys())
    expect(tracked_domains).to_be_like(
        ['http://g1.globo.com', 'http://globoesporte.globo.com'])
def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False):
    """Retrieve `urls_to_retrieve` concurrently with TornadoOctopus; return elapsed seconds.

    `repetitions` is unused here — kept for signature parity with the other
    benchmark runners (TODO confirm against the sibling benchmark functions).
    """
    client = "using SimpleHTTPClient" if ignore_pycurl else "using pycurl"
    message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % client

    # The bare `print` statements were Python 2 leftovers; in Python 3 a bare
    # `print` is a no-op expression. print("") emits a blank line on both 2 and 3.
    print("")
    print("=" * len(message))
    print(message)
    print("=" * len(message))
    print("")

    otto = TornadoOctopus(concurrency=concurrency, cache=False,
                          auto_start=True, ignore_pycurl=ignore_pycurl)

    for url in urls_to_retrieve:
        otto.enqueue(url, handle_url_response)

    start_time = time()
    otto.wait(0)
    return time() - start_time
def get_avatars(urls):
    """Fetch avatar bodies for `urls` concurrently and return them as a list.

    Responses whose body is the literal string 'Not found' are reported on
    stdout and skipped. Results are cached for 60 seconds by the octopus.
    """
    avatars = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True,
        expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if response.text == 'Not found':
            # Python 2-only `print '...'` statement replaced with the call
            # form, which behaves identically on Python 2 and 3 for one arg.
            print('URL Not Found: %s' % url)
        else:
            avatars.append(response.text)

    for url in urls:
        otto.enqueue(url, handle_url_response)

    otto.wait()

    return avatars
def test_can_get_many_urls(self):
    """Several urls enqueued together must all complete with HTTP 200."""
    urls = [
        'http://www.globo.com',
        'http://www.twitter.com',
        'http://www.facebook.com'
    ]
    otto = TornadoOctopus(concurrency=4, auto_start=True)

    def collect(url, response):
        self.responses[url] = response

    for url in urls:
        otto.enqueue(url, collect)

    otto.wait(2)

    expect(self.responses).to_length(3)

    for url in urls:
        expect(self.responses).to_include(url)
        expect(self.responses[url].status_code).to_equal(200)
def _download_url_list(image_url_list):
    '''Downloads the image sources of images listed on `image_url_list`.

    Returns the list of response bodies; urls that answer with the literal
    body 'Not found' are printed and skipped.
    '''
    images = []

    otto = TornadoOctopus(
        concurrency=50, auto_start=True, cache=True,
        expiration_in_seconds=60
    )

    def handle_url_response(url, response):
        if response.text == 'Not found':
            # `print url` was Python 2-only syntax; the call form works on both.
            print(url)
        else:
            images.append(response.text)

    for url in image_url_list:
        otto.enqueue(url, handle_url_response)

    otto.wait(0)

    return images
def test_can_wait_when_no_urls(self, logging_mock):
    """wait() with nothing queued must return immediately and log that it did."""
    otto = TornadoOctopus(cache=False, auto_start=True)

    otto.wait()

    # `assert_calls` does not exist on Mock — modern unittest.mock raises
    # AttributeError for unknown `assert_*` names; older versions silently
    # asserted nothing. Use the real assertion helper instead.
    logging_mock.assert_any_call('No urls to wait for. Returning immediately.')