class FifoDiskPriorityQueueTest(FifoMemoryPriorityQueueTest):
    """Repeat the FIFO memory priority-queue tests using FifoDiskQueue.

    Adds cases where ``push`` receives a lambda and is expected to raise
    ``TypeError``, then checks the queue still pops and closes as expected.
    """

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        """Build one internal queue from ``self.mktemp()``; ``prio`` is unused."""
        tracked_queue_cls = track_closed(FifoDiskQueue)
        return tracked_queue_cls(self.mktemp())

    def _assert_lambda_rejected(self):
        """Check that pushing a lambda at priority 0 raises ``TypeError``."""
        self.assertRaises(TypeError, self.q.push, lambda x: x, 0)

    def _push_around_rejected_item(self):
        """Push 'a' (3) and 'b' (1), attempt the failing push, then push 'c' (2)."""
        for item, prio in (('a', 3), ('b', 1)):
            self.q.push(item, prio)
        self._assert_lambda_rejected()
        self.q.push('c', 2)

    def test_nonserializable_object_one(self):
        """A rejected push on an empty queue leaves ``close()`` returning []."""
        self._assert_lambda_rejected()
        self.assertEqual(self.q.close(), [])

    def test_nonserializable_object_many_close(self):
        """After a rejected push and one pop, ``close()`` reports 2 and 3."""
        self._push_around_rejected_item()
        self.assertEqual(self.q.pop(), 'b')
        remaining_prios = sorted(self.q.close())
        self.assertEqual(remaining_prios, [2, 3])

    def test_nonserializable_object_many_pop(self):
        """After a rejected push, the accepted items pop in priority order."""
        self._push_around_rejected_item()
        for expected in ('b', 'c', 'a', None):
            self.assertEqual(self.q.pop(), expected)
        self.assertEqual(self.q.close(), [])
class FifoMemoryPriorityQueueTest(unittest.TestCase):
    """Exercise PriorityQueue with FifoMemoryQueue as the per-priority queue."""

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        """Build one internal queue; ``prio`` is accepted but not used.

        The class is wrapped with ``track_closed`` -- presumably what provides
        the ``closed`` attribute the tests below read (confirm in its source).
        """
        return track_closed(FifoMemoryQueue)()

    def test_push_pop_noprio(self):
        """Items pushed without a priority pop in insertion order, then None."""
        self.q.push('a')
        self.q.push('b')
        self.q.push('c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), None)

    def test_push_pop_prio(self):
        """Lower priority numbers pop first; equal priorities keep push order."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'd')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), None)

    def test_len_nonzero(self):
        """``len()`` and truthiness follow the number of queued items."""
        # unittest assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        self.assertTrue(self.q)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(len(self.q), 4)
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        """``close()`` returns the used priorities and closes every queue."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        # Snapshot the internal queues: on Python 3 ``dict.values()`` is a
        # live view, so if close() emptied the mapping the check below would
        # iterate over nothing and pass vacuously.
        iqueues = list(self.q.queues.values())
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        self.assertTrue(all(q.closed for q in iqueues))

    def test_close_return_active(self):
        """After popping the only priority-1 item, ``close()`` returns 2 and 3."""
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('a', 3)
        self.q.pop()
        self.assertEqual(sorted(self.q.close()), [2, 3])

    def test_popped_internal_queues_closed(self):
        """An internal queue emptied by ``pop()`` is closed once ``close()`` ran."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        # Keep a reference before popping so the queue can still be inspected.
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        self.assertTrue(p1queue.closed)
class Scheduler(object):
    """Queue requests in memory and, when a job directory is set, on disk.

    ``enqueue_request`` tries the disk priority queue first and falls back to
    the memory priority queue; ``next_request`` pops from the memory queue
    first and only then from the disk queue.
    """

    def __init__(self, dupefilter, jobdir=None, dqclass=None):
        """Store the collaborators; the queues are created later by ``open``.

        dupefilter -- object whose ``open``, ``close`` and ``request_seen``
            methods are called by this class
        jobdir -- base directory for disk queues; a falsy value disables them
        dqclass -- callable invoked with a path to build one disk queue
        """
        self.df = dupefilter
        self.dqdir = join(jobdir, 'requests.queue') if jobdir else None
        self.dqclass = dqclass

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from the ``DUPEFILTER_CLASS`` and
        ``SCHEDULER_DISK_QUEUE`` settings and ``job_dir(settings)``."""
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        return cls(dupefilter, job_dir(settings), dqclass)

    def has_pending_requests(self):
        """Return True when at least one request is queued."""
        return len(self) > 0

    def open(self, spider):
        """Create the queues for ``spider`` and return ``dupefilter.open()``."""
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        """Close the disk queues, save their state, and close the dupefilter.

        ``reason`` is accepted but not used here. Returns the result of
        ``dupefilter.close()``.
        """
        # Test against None, not truthiness: an empty disk queue has length 0
        # and is falsy, which would skip closing it and leave a stale
        # 'active.json' from a previous run untouched.
        if self.dqs is not None:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close()

    def enqueue_request(self, request):
        """Queue ``request`` unless the dupefilter reports it as seen.

        Requests with ``dont_filter`` set bypass the dupefilter check.
        """
        if not request.dont_filter and self.df.request_seen(request):
            return
        if not self._dqpush(request):
            self._mqpush(request)

    def next_request(self):
        """Pop from the memory queue, else from the disk queue (may be None)."""
        return self.mqs.pop() or self._dqpop()

    def __len__(self):
        """Return the number of requests held in both queues."""
        if self.dqs:
            return len(self.dqs) + len(self.mqs)
        return len(self.mqs)

    def _dqpush(self, request):
        """Push ``request`` to the disk queue; return True only on success.

        Returns None when there is no disk queue or when converting or
        pushing the request raises ``ValueError``.
        """
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, request.priority)
        except ValueError:  # non serializable request
            return
        else:
            stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
            return True

    def _mqpush(self, request):
        """Push ``request`` to the memory queue and count it in the stats."""
        stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
        self.mqs.push(request, request.priority)

    def _dqpop(self):
        """Pop one entry from the disk queue and rebuild it as a request.

        Returns None when the disk queue is missing, empty, or yields a
        falsy entry.
        """
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        """Create a memory queue; ``priority`` is accepted but not used."""
        return MemoryQueue()

    def _newdq(self, priority):
        """Create the disk queue stored at ``<dqdir>/p<priority>``."""
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Build the disk priority queue from the priorities in 'active.json'.

        When the file is absent the queue starts with no priorities. A
        message is logged if the resulting queue is non-empty.
        """
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            log.msg("Resuming crawl (%d requests scheduled)" % len(q),
                    spider=self.spider)
        return q
class PriorityQueueTest(unittest.TestCase):
    """Exercise PriorityQueue with TestMemoryQueue as the per-priority queue."""

    def setUp(self):
        # The factory ignores the priority it is called with.
        qfactory = lambda x: TestMemoryQueue()
        self.q = PriorityQueue(qfactory)

    def test_push_pop_noprio(self):
        """Items pushed without a priority pop in insertion order, then None."""
        self.q.push('a')
        self.q.push('b')
        self.q.push('c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), None)

    def test_push_pop_prio(self):
        """Lower priority numbers pop first; equal priorities keep push order."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'd')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), None)

    def test_len_nonzero(self):
        """``len()`` and truthiness follow the number of queued items."""
        # unittest assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        self.assertTrue(self.q)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(len(self.q), 4)
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        """``close()`` returns the used priorities and closes every queue."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        # Snapshot the internal queues: on Python 3 ``dict.values()`` is a
        # live view, so if close() emptied the mapping the check below would
        # iterate over nothing and pass vacuously.
        iqueues = list(self.q.queues.values())
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        self.assertTrue(all(q.closed for q in iqueues))

    def test_popped_internal_queues_closed(self):
        """An internal queue emptied by ``pop()`` is closed once ``close()`` ran."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        # Keep a reference before popping so the queue can still be inspected.
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        self.assertTrue(p1queue.closed)