class FifoDiskPriorityQueueTest(FifoMemoryPriorityQueueTest):
    """FIFO priority-queue tests backed by on-disk internal queues.

    Inherits all behavioural tests from FifoMemoryPriorityQueueTest and
    adds cases for objects that cannot be serialized to disk.
    """

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        # One disk queue per priority level, stored in a temp file and
        # wrapped so the tests can observe close() calls.
        return track_closed(FifoDiskQueue)(self.mktemp())

    def test_nonserializable_object_one(self):
        """Pushing an unserializable object raises and leaves nothing queued."""
        self.assertRaises(TypeError, self.q.push, lambda x: x, 0)
        self.assertEqual(self.q.close(), [])

    def test_nonserializable_object_many_close(self):
        """A failed push must not corrupt the other priority queues."""
        for obj, prio in (('a', 3), ('b', 1)):
            self.q.push(obj, prio)
        self.assertRaises(TypeError, self.q.push, lambda x: x, 0)
        self.q.push('c', 2)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(sorted(self.q.close()), [2, 3])

    def test_nonserializable_object_many_pop(self):
        """Remaining objects still pop in priority order after a failed push."""
        for obj, prio in (('a', 3), ('b', 1)):
            self.q.push(obj, prio)
        self.assertRaises(TypeError, self.q.push, lambda x: x, 0)
        self.q.push('c', 2)
        for expected in ('b', 'c', 'a', None):
            self.assertEqual(self.q.pop(), expected)
        self.assertEqual(self.q.close(), [])
class PriorityQueueTest(unittest.TestCase):
    """Core PriorityQueue behaviour, using in-memory internal queues."""

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        """Return one fresh in-memory queue per priority level.

        Defined as a method rather than ``qfactory = lambda x: ...`` in
        setUp: PEP 8 discourages assigning a lambda to a name, and the
        sibling PriorityQueue test classes define qfactory as a method.
        The ``prio`` argument is intentionally ignored, as it was by the
        original lambda.
        """
        return TestMemoryQueue()

    def test_push_pop_noprio(self):
        """Without an explicit priority, objects pop in FIFO order."""
        self.q.push('a')
        self.q.push('b')
        self.q.push('c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), None)

    def test_push_pop_prio(self):
        """Lower priority values pop first; ties keep insertion order."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'd')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), None)

    def test_len_nonzero(self):
        """len() and truthiness track the number of queued objects."""
        assert not self.q
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        assert self.q
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(len(self.q), 4)
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.q.pop()
        assert not self.q
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        """close() returns the priorities still holding objects and closes
        every internal queue."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        iqueues = self.q.queues.values()
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        assert all(q.closed for q in iqueues)

    def test_popped_internal_queues_closed(self):
        """Internal queues fully drained by pop() get closed as well."""
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        assert p1queue.closed
class Scheduler(object): def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None): self.df = dupefilter self.dqdir = self._dqdir(jobdir) self.dqclass = dqclass self.mqclass = mqclass self.logunser = logunser self.stats = stats @classmethod def from_crawler(cls, crawler): settings = crawler.settings dupefilter_cls = load_object(settings['DUPEFILTER_CLASS']) dupefilter = dupefilter_cls.from_settings(settings) dqclass = load_object(settings['SCHEDULER_DISK_QUEUE']) mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE']) logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS') return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats) def has_pending_requests(self): return len(self) > 0 def open(self, spider): self.spider = spider self.mqs = PriorityQueue(self._newmq) self.dqs = self._dq() if self.dqdir else None return self.df.open() def close(self, reason): if self.dqs: prios = self.dqs.close() with open(join(self.dqdir, 'active.json'), 'w') as f: json.dump(prios, f) return self.df.close(reason) def enqueue_request(self, request): if not request.dont_filter and self.df.request_seen(request): return if not self._dqpush(request): self._mqpush(request) def next_request(self): return self.mqs.pop() or self._dqpop() def __len__(self): return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs) def _dqpush(self, request): if self.dqs is None: return try: reqd = request_to_dict(request, self.spider) self.dqs.push(reqd, -request.priority) except ValueError, e: # non serializable request if self.logunser: log.msg("Unable to serialize request: %s - reason: %s" % \ (request, str(e)), level=log.ERROR, spider=self.spider) return else:
class Scheduler(object): def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False): self.df = dupefilter self.dqdir = self._dqdir(jobdir) self.dqclass = dqclass self.mqclass = mqclass self.logunser = logunser @classmethod def from_settings(cls, settings): dupefilter_cls = load_object(settings['DUPEFILTER_CLASS']) dupefilter = dupefilter_cls.from_settings(settings) dqclass = load_object(settings['SCHEDULER_DISK_QUEUE']) mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE']) logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS') return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser) def has_pending_requests(self): return len(self) > 0 def open(self, spider): self.spider = spider self.mqs = PriorityQueue(self._newmq) self.dqs = self._dq() if self.dqdir else None return self.df.open() def close(self, reason): if self.dqs: prios = self.dqs.close() with open(join(self.dqdir, 'active.json'), 'w') as f: json.dump(prios, f) return self.df.close(reason) def enqueue_request(self, request): if not request.dont_filter and self.df.request_seen(request): return if not self._dqpush(request): self._mqpush(request) def next_request(self): return self.mqs.pop() or self._dqpop() def __len__(self): return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs) def _dqpush(self, request): if self.dqs is None: return try: reqd = request_to_dict(request, self.spider) self.dqs.push(reqd, -request.priority) except ValueError, e: # non serializable request if self.logunser: log.msg("Unable to serialize request: %s - reason: %s" % \ (request, str(e)), level=log.ERROR, spider=self.spider) return else:
class FifoMemoryPriorityQueueTest(unittest.TestCase):
    """FIFO priority-queue behaviour with in-memory internal queues."""

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        # One close-tracking in-memory FIFO queue per priority level.
        return track_closed(FifoMemoryQueue)()

    def test_push_pop_noprio(self):
        """Objects pushed without a priority pop in FIFO order."""
        for obj in ('a', 'b', 'c'):
            self.q.push(obj)
        for expected in ('a', 'b', 'c', None):
            self.assertEqual(self.q.pop(), expected)

    def test_push_pop_prio(self):
        """Lower priorities pop first; equal priorities keep FIFO order."""
        for obj, prio in (('a', 3), ('b', 1), ('c', 2), ('d', 1)):
            self.q.push(obj, prio)
        for expected in ('b', 'd', 'c', 'a', None):
            self.assertEqual(self.q.pop(), expected)

    def test_len_nonzero(self):
        """len() and truthiness follow the queue contents."""
        assert not self.q
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        assert self.q
        for obj, prio in (('b', 1), ('c', 2), ('d', 1)):
            self.q.push(obj, prio)
        self.assertEqual(len(self.q), 4)
        for _ in range(4):
            self.q.pop()
        assert not self.q
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        """close() reports the active priorities and closes every queue."""
        for obj, prio in (('a', 3), ('b', 1), ('c', 2), ('d', 1)):
            self.q.push(obj, prio)
        iqueues = self.q.queues.values()
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        assert all(q.closed for q in iqueues)

    def test_close_return_active(self):
        """Priorities fully drained by pop() are not reported by close()."""
        for obj, prio in (('b', 1), ('c', 2), ('a', 3)):
            self.q.push(obj, prio)
        self.q.pop()
        self.assertEqual(sorted(self.q.close()), [2, 3])

    def test_popped_internal_queues_closed(self):
        """An internal queue emptied by pop() gets closed immediately."""
        for obj, prio in (('a', 3), ('b', 1), ('c', 2)):
            self.q.push(obj, prio)
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        assert p1queue.closed
class Scheduler(object):
    """Request scheduler with an in-memory queue and an optional
    disk-backed queue for persistence across runs.

    Requests are filtered through a dupefilter, then pushed to the disk
    queue when one is configured and the request serializes, falling
    back to the memory queue otherwise.
    """

    def __init__(self, dupefilter, jobdir=None, dqclass=None):
        self.df = dupefilter
        # Disk queues live under <jobdir>/requests.queue; no jobdir
        # means no persistence at all.
        self.dqdir = join(jobdir, 'requests.queue') if jobdir else None
        self.dqclass = dqclass

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor: build dupefilter and disk-queue class
        from the project settings."""
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        return cls(dupefilter, job_dir(settings), dqclass)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        """Prepare queues for *spider* and open the dupefilter."""
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        """Persist the active disk-queue priorities and close the
        dupefilter, returning whatever df.close() returns."""
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        # Fix: propagate the close reason to the dupefilter, as the
        # other Scheduler revisions in this file do; it was dropped here.
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Queue *request* unless the dupefilter has already seen it."""
        if not request.dont_filter and self.df.request_seen(request):
            return
        if not self._dqpush(request):
            self._mqpush(request)

    def next_request(self):
        # Memory queue is drained before the disk queue.
        return self.mqs.pop() or self._dqpop()

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        """Try to push *request* to disk; return True on success, or a
        falsy value when there is no disk queue or serialization fails."""
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            # NOTE(review): later revisions push -request.priority so
            # that higher-priority requests pop first — confirm the
            # intended ordering for this version before changing it.
            self.dqs.push(reqd, request.priority)
        except ValueError:  # non serializable request
            return
        else:
            stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
            return True

    def _mqpush(self, request):
        """Push *request* to the in-memory queue."""
        stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
        self.mqs.push(request, request.priority)

    def _dqpop(self):
        """Pop one request from disk, or None when empty/absent."""
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        # Factory for the memory PriorityQueue's internal queues.
        return MemoryQueue()

    def _newdq(self, priority):
        # Factory for the disk PriorityQueue's internal queues; one
        # file per priority level.
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Build the disk priority queue, resuming any priorities
        recorded in active.json by a previous run's close()."""
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            log.msg("Resuming crawl (%d requests scheduled)" % len(q),
                spider=self.spider)
        return q
class Scheduler(SettingObject): dupfilter_class = StringField(default="scrapy.dupefilter.RFPDupeFilter") schedule_disk_queue = StringField(default="scrapy.squeue.PickleLifoDiskQueue") schedule_memory_queue = StringField(default="scrapy.squeue.LifoMemoryQueue") log_unserailizable_requests = BooleanField(default=False) jobdir = StringField(default="") def __init__(self, settings): super(Scheduler, self).__init__(settings) dupefilter_cls = load_object(self.dupfilter_class.to_value()) dupefilter = dupefilter_cls(self.metas) dqclass = load_object(self.schedule_disk_queue.to_value()) mqclass = load_object(self.schedule_memory_queue.to_value()) logunser = self.log_unserailizable_requests.to_value() self.df = dupefilter self.jobpath = self.__job_dir(self.jobdir.to_value()) self.dqdir = self._dqdir(self.jobpath) self.dqclass = dqclass self.mqclass = mqclass self.logunser = logunser def __job_dir(self, path): if path and not os.path.exists(path): os.makedirs(path) return path def has_pending_requests(self): return len(self) > 0 def open(self, spider): self.spider = spider self.mqs = PriorityQueue(self._newmq) self.dqs = self._dq() if self.dqdir else None return self.df.open() def close(self, reason): if self.dqs: prios = self.dqs.close() with open(join(self.dqdir, 'active.json'), 'w') as f: json.dump(prios, f) return self.df.close(reason) def enqueue_request(self, request): if not request.dont_filter and self.df.request_seen(request): return if not self._dqpush(request): self._mqpush(request) def next_request(self): return self.mqs.pop() or self._dqpop() def __len__(self): return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs) def _dqpush(self, request): if self.dqs is None: return try: reqd = request_to_dict(request, self.spider) self.dqs.push(reqd, -request.priority) except ValueError, e: # non serializable request if self.logunser: log.msg("Unable to serialize request: %s - reason: %s" % \ (request, str(e)), level=log.ERROR, spider=self.spider) return else: