import os
import time
import logging
from urlparse import urlsplit  # Python 2 (this module uses xrange)

# FileEnqueue, FileDequeue, DummyFileEnqueue, FPSortingQueueFileReader,
# WorksetMapper, Seen, Scheduler and hqconfig are assumed to be provided
# by sibling modules in this package.

class WorkSet(object):
    def __init__(self, wsdir, wsid, writing=False, reading=True):
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(self.wsid))
        if writing:
            FileEnqueue.recover(self.qdir)
            self.enq = FileEnqueue(self.qdir, buffer=200)
        else:
            self.enq = DummyFileEnqueue(self.qdir)
        if reading:
            self.deq = FileDequeue(self.qdir)
        else:
            # dummy?
            self.deq = None
        self.running = True
        self.scheduledcount = 0
        self.checkedoutcount = 0
        self.finishedcount = 0
        self.activecount = 0

    def flush(self):
        # _flush() should be part of close(), but not now
        self.enq._flush()
        self.enq.close()

    def shutdown(self):
        self.flush()
        if self.deq:
            self.deq.close()

    def get_status(self):
        r = dict(id=self.wsid, running=self.running,
                 scheduled=self.scheduledcount,
                 checkedout=self.checkedoutcount,
                 finished=self.finishedcount)
        if self.enq:
            r['enq'] = self.enq.get_status()
        if self.deq:
            r['deq'] = self.deq.get_status()
        return r

    def schedule(self, curi):
        self.enq.queue(curi)
        self.scheduledcount += 1

    def checkout(self, n):
        if not self.running:
            return []
        r = []
        while len(r) < n:
            curi = self.deq.get(timeout=0.001)
            if curi is None:
                # dequeue exhausted; close enq so its buffered URIs become
                # readable, but only when enough have accumulated, to
                # avoid flushing the queue too frequently
                if self.enq.queue_count > 10000:
                    self.enq.close()
                break
            r.append(curi)
        self.checkedoutcount += len(r)
        return r

    def deschedule(self, furi):
        self.finishedcount += 1
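# A minimal usage sketch for WorkSet (not part of the original module; the
# '/tmp/ws' path and workset id 0 are illustrative assumptions). It shows
# the schedule -> checkout -> deschedule round trip a crawler would drive.
def _workset_example():
    ws = WorkSet('/tmp/ws', 0, writing=True, reading=True)
    ws.schedule(dict(u='http://example.com/'))
    for curi in ws.checkout(10):   # pull up to 10 URIs for a crawler
        ws.deschedule(curi)        # report each URI back as finished
    ws.shutdown()                  # flushes the enqueue, closes the dequeue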
class IncomingQueue(object):
    # default maxsize 1GB - this would be too big for multi-queue
    # settings
    def __init__(self, qdir, noupdate=False, norecover=False, **kw):
        # ensure qdir directory exists
        self.qdir = qdir
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)
        self.addedcount = 0
        self.processedcount = 0
        self.rqfile = None
        self.qfiles = None
        if not norecover:
            FileEnqueue.recover(self.qdir)
        self.init_queues(**kw)

    def init_queues(self, buffsize=0, maxsize=1000*1000*1000):
        # dequeue side
        self.rqfile = FileDequeue(self.qdir)
        # single queue file, no asynchronous writes
        self.qfiles = [FileEnqueue(self.qdir, buffer=buffsize,
                                   maxsize=maxsize)]

    @property
    def buffsize(self):
        return self.qfiles[0].buffer_size

    @buffsize.setter
    def buffsize(self, v):
        for enq in self.qfiles:
            enq.buffer_size = v

    def __del__(self):
        self.shutdown()

    def close(self, blocking=True):
        if self.qfiles:
            for q in self.qfiles:
                q.close(blocking=blocking)

    def flush(self):
        if self.qfiles:
            for q in self.qfiles:
                q._flush()

    def shutdown(self):
        if self.rqfile:
            self.rqfile.close()
        # _flush should be part of close, but not now.
        self.flush()
        self.close()

    def get_status(self):
        buffered = sum(enq.buffered_count for enq in self.qfiles)
        r = dict(addedcount=self.addedcount,
                 processedcount=self.processedcount,
                 bufferedcount=buffered)
        if self.rqfile:
            r['queuefilecount'] = self.rqfile.qfile_count()
            r['dequeue'] = self.rqfile.get_status()
        return r

    def add(self, curis):
        result = dict(processed=0)
        for curi in curis:
            enq = self.qfiles[0]
            enq.queue(curi)
            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        o = self.rqfile.get(timeout)
        # if queue exhausted, try closing current enq
        # leave busy queues
        if not o:
            self.close(blocking=False)
        if o:
            self.processedcount += 1
        return o
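# A hedged sketch of the IncomingQueue round trip (not in the original
# source; the '/tmp/inq' path is an assumption). add() buffers curi dicts
# through the FileEnqueue; after a flush, get() drains them back through
# the FileDequeue.
def _incoming_queue_example():
    inq = IncomingQueue('/tmp/inq')
    inq.add([dict(u='http://example.com/%d' % i) for i in range(3)])
    inq.flush()                    # make buffered URIs visible to readers
    curi = inq.get(timeout=0.5)    # None if no queue file is readable yet
    print curi, inq.get_status()
    inq.shutdown()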
class CrawlJob(object):
    def __init__(self, jobconfigs, jobname, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)
        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    # def get_workset_status(self):
    #     r = dict(job=self.jobname, crawljob=id(self))
    #     if self.scheduler:
    #         r['sch'] = id(self.scheduler)
    #         r['worksets'] = self.scheduler.get_workset_status()
    #     return r

    def get_domaininfo(self, url):
        uc = urlsplit(url)
        host = uc.netloc
        # strip port number, if any
        p = host.find(':')
        if p > 0:
            host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis, bypassing seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''process the incoming queue. the maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        the actual number of URIs processed may exceed it if the incoming
        queue stores URIs in chunks.'''
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None:
                break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                # unseen, or seen-record expired; carry crawl hints over
                # and hand the URI to the scheduler
                if 'w' in furi:
                    w = furi['w']
                else:
                    w = dict()
                    for k in ('p', 'v', 'x'):
                        m = furi.get(k)
                        if m is not None:
                            w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        #self.mongo.end_request()
        return result

    def makecuri(self, o):
        return o

    def flush(self):
        self.seen.flush()
        return self.scheduler.flush()
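# A rough driver loop for CrawlJob (a sketch, not the project's actual
# entry point): it assumes jobconfigs and domaininfo objects compatible
# with the interfaces used above, e.g. domaininfo.get(host) returning a
# dict with an 'exclude' flag, and a job named 'wide' for illustration.
def _crawljob_example(jobconfigs, domaininfo):
    job = CrawlJob(jobconfigs, 'wide', domaininfo)
    # drain up to ~500 URIs from the incoming queue through the seen-check
    result = job.processinq(500)
    logging.info('processed=%(processed)d scheduled=%(scheduled)d '
                 'excluded=%(excluded)d', result)
    job.flush()
    job.shutdown()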