Example #1
import os

# FileEnqueue, FileDequeue and DummyFileEnqueue are this project's
# file-backed queue classes (their module is not shown in this example).
class WorkSet(object):

    def __init__(self, wsdir, wsid, writing=False, reading=True):
        self.wsid = wsid

        self.qdir = os.path.join(wsdir, str(self.wsid))

        if writing:
            # recover any queue files left in an inconsistent state by a
            # previous run before opening a new enqueue
            FileEnqueue.recover(self.qdir)
            self.enq = FileEnqueue(self.qdir, buffer=200)
        else:
            self.enq = DummyFileEnqueue(self.qdir)
        if reading:
            self.deq = FileDequeue(self.qdir)
        else:
            # dummy?
            self.deq = None

        self.running = True

        self.scheduledcount = 0
        self.checkedoutcount = 0
        self.finishedcount = 0
        self.activecount = 0

    def flush(self):
        # _flush() should eventually be folded into close(); kept
        # separate for now
        self.enq._flush()
        self.enq.close()
        
    def shutdown(self):
        self.flush()
        if self.deq:
            self.deq.close()

    def get_status(self):
        r = dict(id=self.wsid, running=self.running,
                 scheduled=self.scheduledcount,
                 checkedout=self.checkedoutcount,
                 finished=self.finishedcount
                 )
        if self.enq: r['enq'] = self.enq.get_status()
        if self.deq: r['deq'] = self.deq.get_status()
        return r

    def schedule(self, curi):
        self.enq.queue(curi)
        self.scheduledcount += 1

    def checkout(self, n):
        '''return up to n URIs from the read side of this workset.'''
        if not self.running or self.deq is None:
            return []
        r = []
        while len(r) < n:
            curi = self.deq.get(timeout=0.001)
            if curi is None:
                # read side exhausted; close the enqueue so its buffered
                # URIs become readable, but only when enough have piled
                # up, to avoid flushing the queue too frequently
                if self.enq.queue_count > 10000:
                    self.enq.close()
                break
            r.append(curi)
        self.checkedoutcount += len(r)
        return r
    
    def deschedule(self, furi):
        # record completion of a previously checked-out URI
        self.finishedcount += 1
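
A minimal usage sketch for WorkSet, assuming the file-queue classes above are importable and a writable workset directory; the path and URI below are illustrative only:

# Hypothetical usage (sketch, not part of the original example).
ws = WorkSet('/tmp/worksets', 0, writing=True, reading=True)
ws.schedule(dict(u='http://example.com/'))  # enqueue one crawl URI
ws.flush()                                  # push buffered URIs to disk
for curi in ws.checkout(10):                # check out up to 10 URIs
    ws.deschedule(curi)                     # mark each one finished
ws.shutdown()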
Example #2
import os

class IncomingQueue(object):
    def __init__(self, qdir, noupdate=False, norecover=False, **kw):
        # ensure qdir directory exists
        self.qdir = qdir
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)

        self.addedcount = 0
        self.processedcount = 0

        self.rqfile = None
        self.qfiles = None

        if not norecover:
            # recover queue files left in an inconsistent state by a
            # previous run
            FileEnqueue.recover(self.qdir)
        self.init_queues(**kw)

    def init_queues(self, buffsize=0, maxsize=1000*1000*1000):
        # the default maxsize of 1GB would be too big for multi-queue
        # settings
        # dequeue side
        self.rqfile = FileDequeue(self.qdir)
        # single queue file, no asynchronous writes
        self.qfiles = [FileEnqueue(self.qdir, buffer=buffsize,
                                   maxsize=maxsize)]

    @property
    def buffsize(self):
        return self.qfiles[0].buffer_size
    @buffsize.setter
    def buffsize(self, v):
        for enq in self.qfiles:
            enq.buffer_size = v

    def __del__(self):
        self.shutdown()

    def close(self, blocking=True):
        if self.qfiles:
            for q in self.qfiles:
                q.close(blocking=blocking)
            
    def flush(self):
        if self.qfiles:
            for q in self.qfiles:
                q._flush()

    def shutdown(self):
        if self.rqfile:
            self.rqfile.close()
        # _flush should eventually be folded into close(); kept separate
        # for now
        self.flush()
        # some variants of this class use a background write executor;
        # shut it down only if present
        if getattr(self, 'write_executor', None):
            self.write_executor.shutdown()
        self.close()

    def get_status(self):
        buffered = sum(enq.buffered_count for enq in self.qfiles)
        r = dict(addedcount=self.addedcount,
                 processedcount=self.processedcount,
                 bufferedcount=buffered)
        if self.rqfile:
            r['queuefilecount'] = self.rqfile.qfile_count()
            r['dequeue'] = self.rqfile.get_status()
        return r

    def add(self, curis):
        result = dict(processed=0)
        enq = self.qfiles[0]
        for curi in curis:
            enq.queue(curi)
            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        o = self.rqfile.get(timeout)
        if o:
            self.processedcount += 1
        else:
            # queue exhausted; close current enqueue files (skipping busy
            # ones) so their buffered URIs become readable on a later call
            self.close(blocking=False)
        return o
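
A minimal usage sketch for IncomingQueue, under the same assumptions about the file-queue module; the directory below is illustrative only:

# Hypothetical usage (sketch, not part of the original example).
inq = IncomingQueue('/tmp/hq/inq', buffsize=1000)
inq.add([dict(u='http://example.com/a'), dict(u='http://example.com/b')])
inq.flush()                    # make buffered URIs readable
curi = inq.get(timeout=0.01)   # a URI dict, or None when exhausted
inq.shutdown()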
Example #3
import logging
import time
from urlparse import urlsplit  # Python 2, as the xrange below indicates

# hqconfig, FileDequeue, FPSortingQueueFileReader, WorksetMapper, Seen
# and Scheduler come from this project's own modules (not shown here).
class CrawlJob(object):
    def __init__(self, jobconfigs, jobname, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)

        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo
        
        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    # def get_workset_status(self):
    #     r = dict(job=self.jobname, crawljob=id(self))
    #     if self.scheduler:
    #         r['sch'] = id(self.scheduler)
    #         r['worksets'] = self.scheduler.get_workset_status()
    #     return r
        
    def get_domaininfo(self, url):
        # look up per-domain configuration by hostname, port stripped
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        return self.domaininfo.get(host)
        
    def schedule(self, curis):
        '''schedule curis, bypassing the seen-check. typically used for
           starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            # delegate to the scheduler, which maps each URI to its
            # workset through self.mapper (this class keeps no worksets
            # of its own)
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''process the incoming queue. the maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        the actual number of URIs processed may exceed it if the incoming
        queue stores URIs in chunks.'''
        # td: time spent dequeuing; ts: time spent in seen-check/scheduling
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            # seen-check: already_seen() returns the seen record; schedule
            # the URI only if the record's expiration time 'e' has passed
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                # carry crawl metadata into the scheduled URI: either the
                # 'w' dict as-is, or the p/v/x fields collected into one
                if 'w' in furi:
                    w = furi['w']
                else:
                    w = dict()
                    for k in ('p', 'v', 'x'):
                        m = furi.get(k)
                        if m is not None:
                            w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        #self.mongo.end_request()
        return result

    def makecuri(self, o):
        # no conversion needed here; return the queue entry as-is
        return o

    def flush(self):
        self.seen.flush()
        return self.scheduler.flush()
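
A minimal usage sketch for CrawlJob, assuming hqconfig points at writable job directories and that jobconfigs and domaininfo objects with the interfaces used above already exist; all names below are illustrative only:

# Hypothetical usage (sketch, not part of the original example).
job = CrawlJob(jobconfigs, 'wide-crawl', domaininfo)
job.schedule([dict(u='http://example.com/')])  # seed, bypassing seen-check
stats = job.processinq(500)  # process up to ~500 URIs from the incoming queue
print stats                  # processed/scheduled/excluded counts (Python 2)
job.flush()
job.shutdown()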