Example #1
# gzip and py are needed by the queue-file dump below; LevelDispatcher,
# FileEnqueue, readqueue and the pytest fixtures come from the test module
# this listing was extracted from.
import gzip

import py

def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/3')
    # mark the domain excluded: processinq should route the URI to the
    # excluded-list queue rather than schedule it
    domaininfo.excluded = 1

    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 1, r
    assert r['saved'] == 0, r

    dispatcher.shutdown()

    # dump the excluded-queue file contents for inspection
    for q in py.path.local(dispatcher.excludedlist.qdir).listdir(
            fil=lambda p: p.ext == '.gz'):
        with gzip.open(str(q)) as f:
            print(f.read())

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
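
The tests in this listing lean on pytest fixtures (testdatadir, domaininfo, mapper, scheduler) that are not shown. Below is a minimal sketch of what stub fixtures could look like, inferred only from how the tests use them: the attribute names excluded, curis and _client_active come from the tests above, while everything else (including domaininfo.get()) is an assumption. The mapper fixture is omitted, since only LevelDispatcher touches it; Example #6 shows a real CrawlMapper being constructed.

import pytest

@pytest.fixture
def testdatadir(tmpdir):
    # provides the inqdir(job) accessor the tests call; this directory
    # layout is an assumption
    class TestDataDir(object):
        def inqdir(self, job):
            return str(tmpdir.join(job, 'inq').ensure(dir=1))
    return TestDataDir()

@pytest.fixture
def domaininfo():
    # only the 'excluded' flag is set directly by these tests; get() is
    # an assumed interface for LevelDispatcher's benefit
    class StubDomainInfo(object):
        excluded = 0
        def get(self, host):
            return dict(exclude=self.excluded)
    return StubDomainInfo()

@pytest.fixture
def scheduler():
    # records scheduled curis; _client_active=False makes every URI look
    # out-of-scope (compare testOutOfScope)
    class StubScheduler(object):
        def __init__(self):
            self.curis = []
            self._client_active = True
        def schedule(self, curi):
            self.curis.append(curi)
    return StubScheduler()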
Example #2
def init_dispatcher(self):
    if self.dispatcher: return self.dispatcher
    if self.dispatcher_mode == 'external':
        raise RuntimeError('dispatcher mode is %s' % self.dispatcher_mode)
    self.dispatcher = LevelDispatcher(self.hq.get_domaininfo(),
                                      self.jobname,
                                      mapper=self.mapper,
                                      scheduler=self.scheduler,
                                      inq=self.inq.deq)
    return self.dispatcher
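
This is the same init_dispatcher shown inside the CrawlJob class in Example #6: the LevelDispatcher is created lazily on first use, and creating one is refused when the job has been switched to an external dispatcher.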
Example #3
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/1')
    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 1, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 0, r

    assert len(scheduler.curis) == 1
    assert scheduler.curis[0]['u'] == curi['u']
Example #4
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi1 = dict(u='http://test.example.com/2')
    # pre-mark the URI as already seen so processinq drops it as a duplicate
    dispatcher.init_seen()
    dispatcher.seen.already_seen(curi1)

    enq.queue([curi1])
    enq.close()

    #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 0, r

    assert len(scheduler.curis) == 0, scheduler.curis
Example #5
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/')
    # with no active client the URI counts as out-of-scope and is diverted
    # ("saved") instead of scheduled
    scheduler._client_active = False

    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 0, r
    assert r['saved'] == 1, r

    dispatcher.shutdown()

    items = readqueue(dispatcher.diverter.getqueue('0').qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
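
Taken together, Examples #1 and #3-#5 exercise the four outcomes that LevelDispatcher.processinq() reports for a URI: scheduled (handed to the scheduler), excluded (the domain is marked excluded, so the URI lands in the excludedlist queue), already seen (silently dropped; none of the other counters move), and saved (no active client, so the URI is diverted to a queue for later). In every case the processed counter is incremented. A sketch of a drain loop built on that result dict; the batch size and the assumption that processed drops to 0 once the incoming queue is empty are mine, not the project's:

# drain the incoming queue in batches, tallying processinq's counters;
# key names follow the assertions in the tests above
totals = dict(processed=0, scheduled=0, excluded=0, saved=0)
while True:
    r = dispatcher.processinq(500)
    if not r['processed']:
        break  # assumed: an empty inqueue yields processed == 0
    for k in totals:
        totals[k] += r.get(k, 0)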
Example #6
class CrawlJob(object):

    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            # read queue files through the sorting reader when inq.sort is set
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool),
              ('dispatcher_mode', str)]

    @property
    def dispatcher_mode(self):
        return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()

    def init_dispatcher(self):
        if self.dispatcher: return self.dispatcher
        if self.dispatcher_mode == 'external':
            raise RuntimeError('dispatcher mode is %s' % self.dispatcher_mode)
        self.dispatcher = LevelDispatcher(self.hq.get_domaininfo(),
                                          self.jobname,
                                          mapper=self.mapper,
                                          scheduler=self.scheduler,
                                          inq=self.inq.deq)
        return self.dispatcher

    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher().workset_activating(*args)

    def schedule(self, curis):
        '''Schedule curis, bypassing the seen-check. Typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        return self.init_dispatcher().processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure; delete once everything's been fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 1.0:
                logging.warning("SLOW update_crawlinfo: %s %.3fs/%d",
                                client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.addedcount > self.last_inq_count + 1000:
                self.inq.close()
                self.last_inq_count = self.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.close()
        return self.scheduler.flush_clients()

    def count_seen(self):
        """return number of items in seen db.
        can take pretty long to return.
        """
        return self.init_dispatcher().count_seen()

    def clear_seen(self):
        self.init_dispatcher().clear_seen()
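
For orientation, here is a sketch of how a CrawlJob might be driven end-to-end, using only methods defined in the class above; the hq object, the client name and the curi payload are placeholders, not names from the project:

# hq is whatever object supplies jobconfigs, get_domaininfo() and
# crawlinfo in the real system (placeholder here)
job = CrawlJob(hq, 'wide')

# newly discovered URIs go into the incoming queue
job.discovered([dict(u='http://example.com/')])

# the dispatcher pulls them back out: seen-check, exclusion, scheduling
job.processinq(500)

# a crawler client fetches work and reports completions
curis = job.feed('client-0', 100)
job.finished(curis)

job.shutdown()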