class Distributor:
    """Pushes the tasks of a job onto a MongoDB-backed task queue."""

    def __init__(self):
        """Connect to MongoDB and bind the task queue.

        The queue wraps the ``taskqueue`` collection of the ``crooshdb``
        database and is created with consumer id ``"distributor"``,
        ``timeout=300`` and ``max_attempts=3``.
        """
        self.mongo = MongoClient('mongodb://localhost:27017/')
        collection = self.mongo.crooshdb.taskqueue
        self.taskqueue = MongoQueue(
            collection,
            consumer_id="distributor",
            timeout=300,
            max_attempts=3)

    def addJob(self, job):
        """Enqueue ``task.toJSON()`` for every task in ``job.tasks``, in order."""
        # The generator is lazy, so each task is serialized right before
        # it is put on the queue.
        serialized = (task.toJSON() for task in job.tasks)
        for payload in serialized:
            self.taskqueue.put(payload)
class MultiThreadingCrawler(object):
    """One producer thread feeding a bounded pool of consumer threads.

    The queue object must provide ``put``, ``empty``, ``get``,
    ``task_done`` and ``join``; those are the only methods called here.
    """

    def __init__(self):
        self.queue = MongoQueue()
        # Seconds slept before the first scheduling pass and between passes.
        self.gap = 1

    def producer(self):
        """Put the integers 0..99 on the queue."""
        for i in range(100):
            self.queue.put(i)

    def consumer(self):
        """Print and acknowledge items until the queue reports empty."""
        # NOTE(review): empty() and get() are separate calls, so another
        # consumer may take the last item in between. What get() does on
        # an empty MongoQueue is not visible here -- confirm.
        while not self.queue.empty():
            item = self.queue.get()
            print(item)
            self.queue.task_done()

    def run(self, max_threads, *args, **kwargs):
        """Start the producer, then keep up to ``max_threads`` consumers busy.

        Args:
            max_threads: Upper bound on simultaneously tracked consumers.
            *args, **kwargs: Accepted for call compatibility; unused.
        """
        producer = threading.Thread(target=self.producer)
        producer.start()
        # Let the producer run for a while
        time.sleep(self.gap)

        threads = []
        while not self.queue.empty():
            # Drop finished workers by building a new list. Removing from
            # the list while iterating it skips elements, which left dead
            # threads counted against max_threads.
            threads = [t for t in threads if t.is_alive()]
            while len(threads) < max_threads and not self.queue.empty():
                thread = threading.Thread(target=self.consumer)
                # The attribute form replaces the deprecated setDaemon().
                thread.daemon = True
                thread.start()
                threads.append(thread)
            # sleep temporarily so CPU can focus execution on other threads
            time.sleep(self.gap)

        # Waiting for all elements to be processed
        self.queue.join()

    def __call__(self, *args, **kwargs):
        """Make the instance callable; forwards everything to ``run``."""
        self.run(*args, **kwargs)
class MongoQueueRetryTimeTests(TestCase):
    """Checks that a failed or released job is only handed out again
    once its retry delay has passed.

    The queue is created with ``retry_after=2``; the sleeps in each test
    are chosen to fall just before and just after the relevant delay.
    """

    def setUp(self):
        self.client = pymongo.MongoClient()
        self.db = self.client.test_queue
        self.queue = MongoQueue(self.db.queue_1, "consumer_1", retry_after=2)

    def tearDown(self):
        self.client.drop_database("test_queue")

    def test_complete_scenario(self):
        """Fail a job in a ``with`` block, then fetch it after the delay."""
        self.queue.put({"message": "hello"})
        job = self.queue.next()
        # NOTE(review): this only passes if the job's context manager
        # swallows the exception; otherwise it propagates here -- confirm.
        with job:
            raise Exception
        time.sleep(1)
        job2 = self.queue.next()
        self.assertEqual(job2, None)
        time.sleep(1.1)
        job3 = self.queue.next()
        # Check the re-delivered job. The original entered ``with job``
        # (the stale first handle) and left ``job3`` unused.
        with job3 as data:
            self.assertEqual(data["message"], "hello")
        job4 = self.queue.next()
        self.assertEqual(job4, None)

    def test_error_with_increased_retry(self):
        """``error(custom_retry_after=3)`` hides the job for the longer delay."""
        self.queue.put({"message": "hello"})
        job = self.queue.next()
        job.error(custom_retry_after=3)
        time.sleep(1)
        job = self.queue.next()
        self.assertEqual(job, None)
        time.sleep(2.1)
        job2 = self.queue.next()
        with job2 as data:
            self.assertEqual(data["message"], "hello")
        job3 = self.queue.next()
        self.assertEqual(job3, None)

    def test_release_with_increased_retry(self):
        """``release(custom_retry_after=3)`` hides the job for the longer delay."""
        self.queue.put({"message": "hello"})
        job = self.queue.next()
        job.release(custom_retry_after=3)
        time.sleep(1)
        job = self.queue.next()
        self.assertEqual(job, None)
        time.sleep(2.1)
        job2 = self.queue.next()
        with job2 as data:
            self.assertEqual(data["message"], "hello")
        job3 = self.queue.next()
        self.assertEqual(job3, None)
class MongoQueueTest(TestCase):
    """Exercises MongoQueue put/next, priority, completion, release and stats.

    Each test uses the ``queue_1`` collection of the ``test_queue``
    database via ``pymongo.MongoClient()``; the database is dropped after
    every test.
    """

    def setUp(self):
        self.client = pymongo.MongoClient()
        self.db = self.client.test_queue
        self.queue = MongoQueue(self.db.queue_1, "consumer_1")

    def tearDown(self):
        self.client.drop_database("test_queue")

    def assert_job_equal(self, job, data):
        """Assert that every key/value of ``data`` matches ``job.payload``."""
        for k, v in data.items():
            self.assertEqual(job.payload[k], v)

    def test_put_next(self):
        """An item that was put comes back once; after that next() is None."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()}
        self.queue.put(dict(data))
        job = self.queue.next()
        self.assert_job_equal(job, data)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_atomic_next(self):
        """Two worker processes dequeuing at once must get different results."""
        data = {"context_id": "alpha321",
                "data": [1, 2, 3],
                "more-data": time.time()}
        self.queue.put(dict(data))
        pool = mp.Pool()
        try:
            jobs = pool.map(dequeue, [1, 2])
        finally:
            # Release the worker processes even if map() raises. The
            # original never closed the pool.
            pool.close()
            pool.join()
        self.assertNotEqual(jobs[0], jobs[1])

    def test_get_empty_queue(self):
        """next() on an empty queue returns None."""
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_priority(self):
        """Items come out in descending ``priority`` order."""
        self.queue.put({"name": "alice"}, priority=1)
        self.queue.put({"name": "bob"}, priority=2)
        self.queue.put({"name": "mike"}, priority=0)

        self.assertEqual(
            ["bob", "alice", "mike"],
            [self.queue.next().payload['name'],
             self.queue.next().payload['name'],
             self.queue.next().payload['name']])
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_complete(self):
        """complete() removes the job, so size() drops from 1 to 0."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": datetime.now()}

        self.queue.put(data)
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        job.complete()
        self.assertEqual(self.queue.size(), 0)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_release(self):
        """A released job stays in the queue and can be fetched again."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()}

        self.queue.put(data)
        job = self.queue.next()
        job.release()
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        self.assert_job_equal(job, data)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_max_attempts(self):
        """A job that always fails is handed out ``max_attempts`` times."""
        data = {"context_id": "alpha", "ts": time.time()}
        self.queue.put(dict(data))
        attempts = 0
        # range() instead of xrange(): xrange does not exist on Python 3.
        for _ in range(self.queue.max_attempts):
            job = self.queue.next()
            if not job:
                break
            # NOTE(review): relies on the job's context manager swallowing
            # the exception -- confirm.
            with job:
                attempts += 1
                raise Exception()
        self.assertEqual(attempts, self.queue.max_attempts)

    def test_error(self):
        # Placeholder: no assertions yet.
        pass

    def test_progress(self):
        # Placeholder: no assertions yet.
        pass

    def test_stats(self):
        """Put five items, error one, and compare stats() to fixed counts."""
        for i in range(5):
            data = {"context_id": "alpha",
                    "data": [1, 2, 3],
                    "more-data": time.time()}
            self.queue.put(data)
        job = self.queue.next()
        job.error("problem")

        stats = self.queue.stats()
        self.assertEqual({'available': 5,
                          'total': 5,
                          'locked': 0,
                          'errors': 0}, stats)

    def test_context_manager_error(self):
        """An exception inside ``with job`` leaves the job fetchable again."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        try:
            with job as data:
                self.assertEqual(data['payload']["foobar"], 1)
                # Item is returned to the queue on error
                raise SyntaxError
        except SyntaxError:
            pass

        job = self.queue.next()
        self.assertEqual(job.data['attempts'], 1)

    def test_context_manager_complete(self):
        """Leaving ``with job`` without error consumes the job."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        with job as data:
            self.assertEqual(data['payload']["foobar"], 1)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_next_by_payload(self):
        """next() with a payload filter only returns matching jobs."""
        self.queue.put({"type": "first_type", "param": "param1"})
        self.queue.put({"type": "second_type", "param": "param2"})
        self.queue.put({"type": "third_type", "param": "param3"})
        job = self.queue.next({"payload.type": "second_type"})
        with job as data:
            self.assertEqual(data["payload"]["param"], "param2")
        job = self.queue.next({"payload.type": "third_type"})
        with job as data:
            self.assertEqual(data["payload"]["param"], "param3")
        job = self.queue.next({"payload.type": "fourth_type"})
        self.assertEqual(job, None)
        job = self.queue.next()
        with job as data:
            self.assertEqual(data["payload"]["param"], "param1")
        job = self.queue.next()
        self.assertEqual(job, None)
class MongoQueueTest(TestCase):
    """Integration tests for MongoQueue against a MongoDB server.

    ``setUp`` binds the queue to ``test_queue.queue_1`` with consumer id
    ``"consumer_1"``; ``tearDown`` drops the ``test_queue`` database.
    """

    def setUp(self):
        self.client = pymongo.MongoClient()
        self.db = self.client.test_queue
        self.queue = MongoQueue(self.db.queue_1, "consumer_1")

    def tearDown(self):
        self.client.drop_database("test_queue")

    def assert_job_equal(self, job, data):
        """Compare each entry of ``data`` with the same key in ``job.payload``."""
        for k, v in data.items():
            self.assertEqual(job.payload[k], v)

    def test_put_next(self):
        """put() followed by next() returns the payload exactly once."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": time.time()
        }
        self.queue.put(dict(data))
        job = self.queue.next()
        self.assert_job_equal(job, data)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_atomic_next(self):
        """Concurrent dequeues from two processes must not return equal results."""
        data = {
            "context_id": "alpha321",
            "data": [1, 2, 3],
            "more-data": time.time()
        }
        self.queue.put(dict(data))
        pool = mp.Pool()
        try:
            jobs = pool.map(dequeue, [1, 2])
        finally:
            # The pool was previously never shut down; close and join it
            # so worker processes do not outlive the test.
            pool.close()
            pool.join()
        self.assertNotEqual(jobs[0], jobs[1])

    def test_get_empty_queue(self):
        """An empty queue yields None from next()."""
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_priority(self):
        """Higher ``priority`` values are dequeued first."""
        self.queue.put({"name": "alice"}, priority=1)
        self.queue.put({"name": "bob"}, priority=2)
        self.queue.put({"name": "mike"}, priority=0)

        self.assertEqual(["bob", "alice", "mike"], [
            self.queue.next().payload['name'],
            self.queue.next().payload['name'],
            self.queue.next().payload['name']
        ])
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_complete(self):
        """size() goes from 1 to 0 once the job is completed."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": datetime.now()
        }

        self.queue.put(data)
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        job.complete()
        self.assertEqual(self.queue.size(), 0)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_release(self):
        """release() keeps the job queued so the next next() returns it."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": time.time()
        }

        self.queue.put(data)
        job = self.queue.next()
        job.release()
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        self.assert_job_equal(job, data)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_max_attempts(self):
        """Count how often a permanently failing job is handed out."""
        data = {"context_id": "alpha", "ts": time.time()}
        self.queue.put(dict(data))
        attempts = 0
        # xrange() is Python 2 only; range() behaves the same here and
        # also runs on Python 3.
        for _ in range(self.queue.max_attempts):
            job = self.queue.next()
            if not job:
                break
            # NOTE(review): assumes the job's context manager suppresses
            # the exception raised below -- confirm.
            with job:
                attempts += 1
                raise Exception()
        self.assertEqual(attempts, self.queue.max_attempts)

    def test_error(self):
        # Placeholder: no assertions yet.
        pass

    def test_progress(self):
        # Placeholder: no assertions yet.
        pass

    def test_stats(self):
        """After five puts and one error(), stats() must equal fixed counts."""
        for i in range(5):
            data = {
                "context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()
            }
            self.queue.put(data)
        job = self.queue.next()
        job.error("problem")

        stats = self.queue.stats()
        self.assertEqual({
            'available': 5,
            'total': 5,
            'locked': 0,
            'errors': 0
        }, stats)

    def test_context_manager_error(self):
        """A failure inside ``with job`` makes the job available again."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        try:
            with job as data:
                self.assertEqual(data['payload']["foobar"], 1)
                # Item is returned to the queue on error
                raise SyntaxError
        except SyntaxError:
            pass

        job = self.queue.next()
        self.assertEqual(job.data['attempts'], 1)

    def test_context_manager_complete(self):
        """A clean ``with job`` block removes the job from the queue."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        with job as data:
            self.assertEqual(data['payload']["foobar"], 1)
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_next_by_payload(self):
        """A filter passed to next() selects jobs by payload field."""
        self.queue.put({"type": "first_type", "param": "param1"})
        self.queue.put({"type": "second_type", "param": "param2"})
        self.queue.put({"type": "third_type", "param": "param3"})
        job = self.queue.next({"payload.type": "second_type"})
        with job as data:
            self.assertEqual(data["payload"]["param"], "param2")
        job = self.queue.next({"payload.type": "third_type"})
        with job as data:
            self.assertEqual(data["payload"]["param"], "param3")
        job = self.queue.next({"payload.type": "fourth_type"})
        self.assertEqual(job, None)
        job = self.queue.next()
        with job as data:
            self.assertEqual(data["payload"]["param"], "param1")
        job = self.queue.next()
        self.assertEqual(job, None)
class MongoQueueTest(TestCase):
    """Integration tests for MongoQueue.

    The MongoDB address is read from the ``TEST_MONGODB`` environment
    variable. Tests use ``test_queue.queue_1`` and the ``test_queue``
    database is dropped after each test.
    """

    def setUp(self):
        # MongoClient replaces pymongo.Connection, which was removed in
        # PyMongo 3.0.
        self.client = pymongo.MongoClient(os.environ.get("TEST_MONGODB"))
        self.db = self.client.test_queue
        self.queue = MongoQueue(self.db.queue_1, "consumer_1")

    def tearDown(self):
        self.client.drop_database("test_queue")

    def assert_job_equal(self, job, data):
        """Assert that every key/value of ``data`` matches ``job.payload``."""
        for k, v in data.items():
            self.assertEqual(job.payload[k], v)

    def test_put_next(self):
        """An item that was put is returned by next()."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()}
        self.queue.put(dict(data))
        job = self.queue.next()
        self.assert_job_equal(job, data)

    def test_get_empty_queue(self):
        """next() on an empty queue returns None."""
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_priority(self):
        """Payloads carrying a higher ``priority`` value are returned first."""
        self.queue.put({"priority": 1, "name": "alice"})
        self.queue.put({"priority": 2, "name": "bob"})
        self.queue.put({"priority": 0, "name": "mike"})

        self.assertEqual(
            ["bob", "alice", "mike"],
            [self.queue.next().payload['name'],
             self.queue.next().payload['name'],
             self.queue.next().payload['name']])

    def test_complete(self):
        """complete() removes the job, so size() drops from 1 to 0."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": datetime.now()}

        self.queue.put(data)
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        job.complete()
        self.assertEqual(self.queue.size(), 0)

    def test_release(self):
        """A released job stays in the queue and can be fetched again."""
        data = {"context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()}

        self.queue.put(data)
        job = self.queue.next()
        job.release()
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        self.assert_job_equal(job, data)

    def test_error(self):
        # Placeholder: no assertions yet.
        pass

    def test_progress(self):
        # Placeholder: no assertions yet.
        pass

    def test_stats(self):
        """Put five items, error one, and compare stats() to fixed counts."""
        for i in range(5):
            data = {"context_id": "alpha",
                    "data": [1, 2, 3],
                    "more-data": time.time()}
            self.queue.put(data)
        job = self.queue.next()
        job.error("problem")

        stats = self.queue.stats()
        self.assertEqual({'available': 5,
                          'total': 5,
                          'locked': 0,
                          'errors': 0}, stats)

    def test_context_manager_error(self):
        """An exception inside ``with job`` leaves the job fetchable again."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        try:
            with job as data:
                self.assertEqual(data['payload']["foobar"], 1)
                # Item is returned to the queue on error
                raise SyntaxError
        except SyntaxError:
            pass

        job = self.queue.next()
        self.assertEqual(job.data['attempts'], 1)

    def test_context_manager_complete(self):
        """Leaving ``with job`` without error consumes the job."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        with job as data:
            self.assertEqual(data['payload']["foobar"], 1)
        job = self.queue.next()
        self.assertEqual(job, None)
class MongoQueueTest(TestCase):
    """Integration tests for MongoQueue against a MongoDB server.

    The server address comes from the ``TEST_MONGODB`` environment
    variable. ``setUp`` binds the queue to ``test_queue.queue_1`` and
    ``tearDown`` drops the ``test_queue`` database.
    """

    def setUp(self):
        # pymongo.Connection no longer exists as of PyMongo 3.0;
        # MongoClient is its replacement.
        self.client = pymongo.MongoClient(os.environ.get("TEST_MONGODB"))
        self.db = self.client.test_queue
        self.queue = MongoQueue(self.db.queue_1, "consumer_1")

    def tearDown(self):
        self.client.drop_database("test_queue")

    def assert_job_equal(self, job, data):
        """Compare each entry of ``data`` with the same key in ``job.payload``."""
        for k, v in data.items():
            self.assertEqual(job.payload[k], v)

    def test_put_next(self):
        """put() followed by next() returns the same payload."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": time.time()
        }
        self.queue.put(dict(data))
        job = self.queue.next()
        self.assert_job_equal(job, data)

    def test_get_empty_queue(self):
        """An empty queue yields None from next()."""
        job = self.queue.next()
        self.assertEqual(job, None)

    def test_priority(self):
        """Higher ``priority`` values are dequeued first."""
        self.queue.put({"name": "alice"}, priority=1)
        self.queue.put({"name": "bob"}, priority=2)
        self.queue.put({"name": "mike"}, priority=0)

        self.assertEqual(["bob", "alice", "mike"], [
            self.queue.next().payload['name'],
            self.queue.next().payload['name'],
            self.queue.next().payload['name']
        ])

    def test_complete(self):
        """size() goes from 1 to 0 once the job is completed."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": datetime.now()
        }

        self.queue.put(data)
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        job.complete()
        self.assertEqual(self.queue.size(), 0)

    def test_release(self):
        """release() keeps the job queued so the next next() returns it."""
        data = {
            "context_id": "alpha",
            "data": [1, 2, 3],
            "more-data": time.time()
        }

        self.queue.put(data)
        job = self.queue.next()
        job.release()
        self.assertEqual(self.queue.size(), 1)
        job = self.queue.next()
        self.assert_job_equal(job, data)

    def test_error(self):
        # Placeholder: no assertions yet.
        pass

    def test_progress(self):
        # Placeholder: no assertions yet.
        pass

    def test_stats(self):
        """After five puts and one error(), stats() must equal fixed counts."""
        for i in range(5):
            data = {
                "context_id": "alpha",
                "data": [1, 2, 3],
                "more-data": time.time()
            }
            self.queue.put(data)
        job = self.queue.next()
        job.error("problem")

        stats = self.queue.stats()
        self.assertEqual({
            'available': 5,
            'total': 5,
            'locked': 0,
            'errors': 0
        }, stats)

    def test_context_manager_error(self):
        """A failure inside ``with job`` makes the job available again."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        try:
            with job as data:
                self.assertEqual(data['payload']["foobar"], 1)
                # Item is returned to the queue on error
                raise SyntaxError
        except SyntaxError:
            pass

        job = self.queue.next()
        self.assertEqual(job.data['attempts'], 1)

    def test_context_manager_complete(self):
        """A clean ``with job`` block removes the job from the queue."""
        self.queue.put({"foobar": 1})
        job = self.queue.next()
        with job as data:
            self.assertEqual(data['payload']["foobar"], 1)
        job = self.queue.next()
        self.assertEqual(job, None)