def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/3')
    domaininfo.excluded = 1
    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 1, r
    assert r['saved'] == 0, r

    dispatcher.shutdown()

    # print excluded qfile content
    for q in py.path.local(dispatcher.excludedlist.qdir).listdir(
            fil=lambda p: p.ext == '.gz'):
        with gzip.open(str(q)) as f:
            print f.read()

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/1')
    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 1, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 0, r

    assert len(scheduler.curis) == 1
    assert scheduler.curis[0]['u'] == curi['u']
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi1 = dict(u='http://test.example.com/2')
    dispatcher.init_seen()
    dispatcher.seen.already_seen(curi1)
    enq.queue([curi1])
    enq.close()

    #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 0, r

    assert len(scheduler.curis) == 0, scheduler.curis
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/')
    scheduler._client_active = False
    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 1, r

    dispatcher.shutdown()

    items = readqueue(dispatcher.diverter.getqueue('0').qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
class CrawlJob(object):
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool),
              ('dispatcher_mode', str)]

    @property
    def dispatcher_mode(self):
        return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()

    def init_dispatcher(self):
        if self.dispatcher:
            return self.dispatcher
        if self.dispatcher_mode == 'external':
            raise RuntimeError('dispatcher mode is %s' % self.dispatcher_mode)
        self.dispatcher = LevelDispatcher(self.hq.get_domaininfo(),
                                          self.jobname,
                                          mapper=self.mapper,
                                          scheduler=self.scheduler,
                                          inq=self.inq.deq)
        return self.dispatcher

    def shutdown_dispatcher(self):
        if not self.dispatcher:
            return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher().workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
        a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        return self.init_dispatcher().processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None:
                    o[k] = m
            if not o['a']:
                del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 1.0:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush,
        # but do not flush too frequently.
        if not r:
            if self.addedcount > self.last_inq_count + 1000:
                self.inq.close()
                self.last_inq_count = self.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.close()
        return self.scheduler.flush_clients()

    def count_seen(self):
        """return number of items in seen db. can take pretty long to return."""
        return self.init_dispatcher().count_seen()

    def clear_seen(self):
        self.init_dispatcher().clear_seen()