Пример #1
0
class WorkSet(object):
    """Queue pair and counters for a single work set.

    Each instance owns an enqueue object (``self.enq``) and a dequeue
    object (``self.deq``), both rooted at ``<wsdir>/<wsid>``, and keeps
    simple counters of scheduled / checked-out / finished CURIs.
    """

    def __init__(self, wsdir, wsid, writing=False):
        """Set up the queue directory path, both queue ends and counters.

        :param wsdir: parent directory of all work set queue directories.
        :param wsid: work set id; its ``str()`` names the sub-directory.
        :param writing: when true, run ``FileEnqueue.recover`` on the
            directory and use a real ``FileEnqueue``; otherwise a
            ``DummyFileEnqueue`` is used for the enqueue side.
        """
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(self.wsid))

        if not writing:
            self.enq = DummyFileEnqueue(self.qdir)
        else:
            FileEnqueue.recover(self.qdir)
            self.enq = FileEnqueue(self.qdir, buffer=200)
        self.deq = FileDequeue(self.qdir)

        self.running = True

        # all counters start from zero
        self.scheduledcount = self.checkedoutcount = 0
        self.finishedcount = self.activecount = 0

    def flush(self):
        """Flush the enqueue side, then close it."""
        # _flush() should be part of close(), but not now
        enq = self.enq
        enq._flush()
        enq.close()

    def shutdown(self):
        """Flush the enqueue side and close the dequeue side."""
        self.flush()
        self.deq.close()

    def get_status(self):
        """Return a dict with the work set id, running flag and counters."""
        return {
            'id': self.wsid,
            'running': self.running,
            'scheduled': self.scheduledcount,
            'checkedout': self.checkedoutcount,
            'finished': self.finishedcount,
        }

    def schedule(self, curi):
        """Queue ``curi`` on the enqueue side and count it as scheduled."""
        self.enq.queue(curi)
        self.scheduledcount += 1

    def checkout(self, n):
        """Return up to ``n`` CURIs read from the dequeue side.

        Returns an empty list when the work set is not running. Reading
        stops early as soon as the dequeue side returns ``None``; in that
        case the enqueue side is closed before returning (presumably so
        that data still held by the writer becomes readable -- confirm
        against FileEnqueue).
        """
        batch = []
        if not self.running:
            return batch
        while len(batch) < n:
            item = self.deq.get(timeout=0.001)
            if item is None:
                self.enq.close()
                break
            batch.append(item)
        self.checkedoutcount += len(batch)
        return batch

    def deschedule(self, furi):
        """Count one finished URI; ``furi`` itself is not inspected."""
        self.finishedcount += 1
Пример #2
0
    def __init__(self, domaininfo, job, mapper,
                 scheduler, inq=None):
        """Wire up the collaborators used for dispatching incoming URIs.

        :param domaininfo: domain information lookup object.
        :param job: job name; used to locate the job's queue directory.
        :param mapper: work set mapper; ``mapper.nworksets`` sizes the
            per-workset state list.
        :param scheduler: scheduler object.
        :param inq: optional incoming queue; when ``None`` a
            ``FileDequeue`` over the job's incoming directory is created.
        """
        self.domaininfo, self.jobname = domaininfo, job
        self.mapper, self.scheduler = mapper, scheduler

        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        if inq is None:
            inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        self.inq = inq

        # seen database is initialized lazily
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        # one state slot per work set, all starting at 0
        self.workset_state = [0] * self.mapper.nworksets

        # TODO: this could be combined with FileDequeue above in a single class
        # a single InqueueWatcher is shared through the class attribute
        if Dispatcher.inqwatcher is None:
            watcher = InqueueWatcher()
            Dispatcher.inqwatcher = watcher
            watcher.start()
        self.watch = Dispatcher.inqwatcher.addwatch(
            hqconfig.inqdir(self.jobname))
Пример #3
0
    def __init__(self, jobconfigs, jobname, domaininfo):
        """Create the incoming queue reader, mapper, seen db and scheduler.

        :param jobconfigs: job configuration object (only stored here).
        :param jobname: job name; used to derive queue and seen-db paths.
        :param domaininfo: domain information lookup object.
        """
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.domaininfo = domaininfo

        # incoming queue lives in the job's inq directory
        self.qdir = hqconfig.inqdir(self.jobname)
        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        seendir = hqconfig.seendir(self.jobname)
        self.seen = Seen(dbdir=seendir)

        self.scheduler = Scheduler(self.jobname, self.mapper)
Пример #4
0
    def __init__(self, wsdir, wsid, writing=False):
        """Set up the queue directory path, both queue ends and counters.

        :param wsdir: parent directory of all work set queue directories.
        :param wsid: work set id; its ``str()`` names the sub-directory.
        :param writing: when true, run ``FileEnqueue.recover`` on the
            directory and use a real ``FileEnqueue``; otherwise a
            ``DummyFileEnqueue`` is used for the enqueue side.
        """
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(self.wsid))

        if not writing:
            self.enq = DummyFileEnqueue(self.qdir)
        else:
            FileEnqueue.recover(self.qdir)
            self.enq = FileEnqueue(self.qdir, buffer=200)
        self.deq = FileDequeue(self.qdir)

        self.running = True

        # all counters start from zero
        self.scheduledcount = self.checkedoutcount = 0
        self.finishedcount = self.activecount = 0
Пример #5
0
    def __init__(self, job, qdirbase, splitter):
        """Prepare the job queue directory and its writers/reader.

        :param job: job name; names the sub-directory under ``qdirbase``.
        :param qdirbase: parent directory of per-job queue directories.
        :param splitter: object whose ``nqueues`` attribute gives the
            number of enqueue files to create.
        """
        self.job, self.splitter = job, splitter

        # ensure job directory exists
        jobdir = os.path.join(qdirbase, job)
        self.qdir = jobdir
        if not os.path.isdir(jobdir):
            os.makedirs(jobdir)

        self.addedcount = self.processedcount = 0

        self.maxsize = 1000 ** 3  # 1GB

        self.queue_writer = AsyncFlusher()
        # one enqueue file per window, suffixed with the window number
        writers = []
        for win in range(self.splitter.nqueues):
            writers.append(FileEnqueue(self.qdir, suffix=str(win)))
        self.enqs = writers

        # dequeue side
        self.rqfile = FileDequeue(self.qdir)

        self.qfile_read = self.qfile_written = 0
Пример #6
0
class SplitIncomingQueue(object):
    '''IncomingQueue variant that stores incoming URLs into
       multiple queue files, grouping by id range. This scheme
       has the same effect with merge sort and makes seen check
       much faster.'''

    # Class-level fallbacks: close() reads these, and it is also reached
    # from __del__, which may run on an instance whose __init__ did not
    # complete. __init__ never assigns write_executor, so without the
    # fallback every close() raised AttributeError before closing any
    # enqueue file.
    write_executor = None
    enqs = ()

    def __init__(self, job, qdirbase, splitter):
        '''Prepare the job queue directory and its writers/reader.

        :param job: job name; names the sub-directory under qdirbase.
        :param qdirbase: parent directory of per-job queue directories.
        :param splitter: object whose ``nqueues`` attribute gives the
            number of enqueue files to create.
        '''
        self.job = job
        self.splitter = splitter
        # ensure job directory exists
        self.qdir = os.path.join(qdirbase, job)
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)

        self.addedcount = 0
        self.processedcount = 0

        self.maxsize = 1000*1000*1000 # 1GB

        # NOTE(review): queue_writer is created but not used anywhere in
        # this class; close() looks at write_executor instead. Confirm
        # which of the two is the intended writer object.
        self.queue_writer = AsyncFlusher()
        self.enqs = [FileEnqueue(self.qdir, suffix=str(win))
                     for win in range(self.splitter.nqueues)]

        # dequeue side
        self.rqfile = FileDequeue(self.qdir)

        self.qfile_read = 0
        self.qfile_written = 0

    def __del__(self):
        self.close()

    def close(self):
        '''Shut down the write executor (if one is set) and close every
        enqueue file.'''
        executor = self.write_executor
        if executor is not None:
            executor.shutdown()
        for enq in self.enqs:
            enq.close()

    def shutdown(self):
        '''Alias of close().'''
        self.close()

    def hash(self, curi):
        '''Return the id of curi, computing it with Seen.urikey from
        curi['u'] and caching it in curi['id'] when absent.'''
        if 'id' in curi:
            return curi['id']
        h = Seen.urikey(curi['u'])
        curi['id'] = h
        return h

    def add(self, curis):
        '''Queue each CURI into the enqueue file selected from its id.

        Returns dict(processed=<number of CURIs queued>).
        '''
        # NOTE(review): window_bits and win_mask are not assigned anywhere
        # in this class; unless something else sets them on the instance,
        # this method raises AttributeError. Confirm where they come from.
        result = dict(processed=0)
        for curi in curis:
            h = self.hash(curi)
            win = (h >> self.window_bits) & self.win_mask
            self.enqs[win].queue(curi)

            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        '''Read one item from the dequeue side, counting it as processed
        when it is truthy. Returns whatever the dequeue side returned.'''
        o = self.rqfile.get(timeout)
        # TODO: if queue exhausted, try closing largest enq
        if o:
            self.processedcount += 1
        return o
Пример #7
0
 def init_queues(self, buffsize=0, maxsize=1000*1000*1000):
     """Create the reader and the single writer over ``self.qdir``.

     :param buffsize: passed to ``FileEnqueue`` as ``buffer``.
     :param maxsize: passed to ``FileEnqueue`` as ``maxsize``.
     """
     # dequeue side
     self.rqfile = FileDequeue(self.qdir)
     # single queue file, no asynchronous writes
     writer = FileEnqueue(self.qdir, buffer=buffsize, maxsize=maxsize)
     self.qfiles = [writer]
Пример #8
0
class IncomingQueue(object):
    """Incoming URI queue backed by queue files in a single directory.

    Writes go through the FileEnqueue objects in ``self.qfiles``; reads
    come from the FileDequeue in ``self.rqfile``.
    """

    # Class-level fallbacks: shutdown() is also reached from __del__,
    # which may run on an instance whose __init__ did not complete.
    # write_executor is never assigned by this class, so without the
    # fallback shutdown() raised AttributeError before closing the
    # enqueue files.
    rqfile = None
    qfiles = None
    write_executor = None

    # default maxsize 1GB - this would be too big for multi-queue
    # settings
    def __init__(self, qdir, noupdate=False, norecover=False, **kw):
        """Create the queue directory if needed and open both queue ends.

        :param qdir: queue directory; created when it does not exist.
        :param noupdate: accepted but not used by this class.
        :param norecover: when true, skip ``FileEnqueue.recover(qdir)``.
        :param kw: forwarded to :meth:`init_queues`.
        """
        # ensure qdir directory exists
        self.qdir = qdir
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)

        self.addedcount = 0
        self.processedcount = 0

        self.rqfile = None
        self.qfiles = None

        if not norecover:
            FileEnqueue.recover(self.qdir)
        self.init_queues(**kw)

    def init_queues(self, buffsize=0, maxsize=1000*1000*1000):
        """Create the reader and the single writer over ``self.qdir``."""
        # dequeue side
        self.rqfile = FileDequeue(self.qdir)
        # single queue file, no asynchronous writes
        self.qfiles = [FileEnqueue(self.qdir, buffer=buffsize,
                                   maxsize=maxsize)]

    @property
    def buffsize(self):
        """Buffer size of the first enqueue file.

        Setting it applies the value to every enqueue file.
        """
        # the original getter lacked "return" and always yielded None
        return self.qfiles[0].buffer_size
    @buffsize.setter
    def buffsize(self, v):
        for enq in self.qfiles:
            enq.buffer_size = v

    def __del__(self):
        self.shutdown()

    def close(self, blocking=True):
        """Close every enqueue file, passing ``blocking`` through."""
        if self.qfiles:
            for q in self.qfiles:
                q.close(blocking=blocking)

    def flush(self):
        """Call ``_flush()`` on every enqueue file."""
        if self.qfiles:
            for q in self.qfiles:
                q._flush()

    def shutdown(self):
        """Close the reader, flush the writers, stop the write executor
        (if one is set) and close the writers."""
        if self.rqfile:
            self.rqfile.close()
        # _flush should be part of close, but not now.
        self.flush()
        executor = self.write_executor
        if executor is not None:
            executor.shutdown()
        self.close()

    def get_status(self):
        """Return counters, plus reader details when a reader exists.

        ``queuefilecount`` and ``dequeue`` are included only when
        ``self.rqfile`` is not None.
        """
        buffered = sum(enq.buffered_count for enq in self.qfiles or ())
        r = dict(addedcount=self.addedcount,
                 processedcount=self.processedcount,
                 bufferedcount=buffered
                 )
        # "is not None" rather than truthiness, so the keys are reported
        # whenever a reader object exists
        if self.rqfile is not None:
            r['queuefilecount'] = self.rqfile.qfile_count()
            r['dequeue'] = self.rqfile.get_status()
        return r

    def add(self, curis):
        """Queue each CURI into the first enqueue file.

        Returns dict(processed=<number of CURIs queued>).
        """
        result = dict(processed=0)
        for curi in curis:
            enq = self.qfiles[0]
            enq.queue(curi)

            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        """Read one item from the reader.

        A truthy item is counted as processed. Otherwise the enqueue
        files are closed with ``blocking=False``.
        """
        o = self.rqfile.get(timeout)
        if o:
            self.processedcount += 1
        else:
            # if queue exhausted, try closing current enq
            # leave busy queues
            self.close(blocking=False)
        return o
Пример #9
0
class CrawlJob(object):
    """Per-job facade over the incoming queue, seen db and scheduler."""

    def __init__(self, jobconfigs, jobname, domaininfo):
        """Create the incoming queue reader, mapper, seen db and scheduler.

        :param jobconfigs: job configuration object (only stored here).
        :param jobname: job name; used to derive queue and seen-db paths.
        :param domaininfo: domain information lookup; must offer
            ``get(host)``.
        """
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)

        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo

        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        """Close the seen db, then shut down the scheduler."""
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        """Return a dict with the job name, object id, and the scheduler
        and incoming-queue status."""
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_domaininfo(self, url):
        """Look up domain info for the host part of url.

        A port suffix, if any, is stripped from the network location
        before the lookup.
        """
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.

           Returns dict(processed=N, scheduled=N) where N is the number
           of CURIs handed to the scheduler.'''
        scheduled = 0
        for curi in curis:
            # This class has no "worksets" attribute, so the former
            # self.worksets[ws].schedule(curi) always raised
            # AttributeError. Go through the scheduler, the same way
            # processinq() does.
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        Returns a dict with counts (processed, scheduled, excluded) and
        accumulated seconds spent dequeuing (td) and on seen-check plus
        scheduling (ts).'''
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)

            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            # schedule only when the seen record's 'e' value is in the past
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    w = furi['w']
                else:
                    # no 'w' dict: collect the 'p', 'v', 'x' values
                    # that are present on furi itself
                    w = dict()
                    for k in ('p','v','x'):
                        m = furi.get(k)
                        if m is not None:
                            w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        return result

    def makecuri(self, o):
        """Return o unchanged."""
        return o

    def flush(self):
        """Flush the seen db, then return the result of the scheduler's
        flush()."""
        self.seen.flush()
        return self.scheduler.flush()
Пример #10
0
class Dispatcher(object):
    """Moves URIs from the incoming queue to the scheduler.

    URIs whose work set is not assigned to an active client are sent to
    the diverter instead; excluded URIs go to the excluded list.
    """
    # single InqueueWatcher shared by all Dispatcher instances
    inqwatcher = None

    # TODO: take JobConfig, instead of job
    def __init__(self, domaininfo, job, mapper,
                 scheduler, inq=None):
        """Wire up the collaborators used for dispatching incoming URIs.

        :param domaininfo: domain information lookup object.
        :param job: job name; used to locate the job's queue directory.
        :param mapper: work set mapper; ``mapper.nworksets`` sizes the
            per-workset state list.
        :param scheduler: scheduler object.
        :param inq: optional incoming queue; when ``None`` a
            ``FileDequeue`` over the job's incoming directory is created.
        """
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler

        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        self.inq = inq
        if self.inq is None:
            self.inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        # seen database is initialized lazily (see init_seen)
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        # one state slot per work set: 0 = inactive, 1 = activated
        self.workset_state = [0] * self.mapper.nworksets

        # TODO: this could be combined with FileDequeue above in a single class
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = Dispatcher.inqwatcher.addwatch(qdir)

    def shutdown(self):
        """Close the seen db (if opened), the diverter and the excluded
        list. The scheduler is not shut down here."""
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
            self.seen = None
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        logging.info("shutting down excludedlist")
        self.excludedlist.shutdown()
        logging.info("done.")

    def flush(self):
        """flushes URIs buffered in workset objects

        NOTE(review): currently a no-op that returns None."""
        #return self.job.flush()

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; start sending CURIs to Scheduler
        and enqueue diverted CURIs back into incoming queue so that
        processinq will process them (again). called by Scheduler,
        through CrawlMapper, when client starts feeding.
        note, unlike workset_deactivating, this method shall not be
        called from inside processinq method below, because processinq
        executes it only when at least one CURI is available for processing.
        if inq is empty, CURIs in divert queues would never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; start sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def init_seen(self):
        """Open the seen db on first use.

        The block cache size is read from the 'seencache' setting
        (multiplied by 1024**2); any error while reading or converting
        it falls back to ``None``.
        """
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except Exception:
                # best-effort: a missing or malformed setting must not
                # prevent the seen db from opening. "except Exception"
                # (rather than a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        Returns a dict with counts (processed, scheduled, excluded,
        saved) and accumulated seconds spent dequeuing (td) and on
        seen-check plus scheduling (ts).'''

        # lazy initialization of seen db
        self.init_seen()

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)

            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if not self.is_workset_active(ws):
                # client is not active: mark the work set deactivated
                # (once) and divert the URI
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
                continue
            # no need to call self.workset_activating(). it's already
            # done by Scheduler
            di = self.domaininfo.get_byurl(furi['u'])
            if di and di['exclude']:
                self.excludedlist.add(furi)
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            # schedule only when the seen record's 'e' value is in the past
            if suri['e'] < int(time.time()):
                curi = dict(u=furi['u'], id=suri['_id'])
                # 'p', 'v', 'x' come from furi['w'] when that is a dict,
                # otherwise from furi itself
                a = furi.get('w')
                if not isinstance(a, dict): a = furi
                for k in 'pvx':
                    m = a.get(k)
                    if m is not None: curi[k] = m
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        return result

    def wait_available(self, timeout=None):
        """Wait on the incoming-queue watch; returns what watch.wait
        returns."""
        return self.watch.wait(timeout)