def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URI is routed to the excluded list when excluded is set.

    A single URI is queued into the 'wide' incoming queue after
    ``domaininfo.excluded`` has been set to 1.  ``processinq`` must then
    report it as processed and excluded (neither scheduled nor saved), and
    after ``shutdown()`` the excluded-list queue directory must hold exactly
    that URI.

    The arguments are presumably pytest fixtures defined elsewhere in the
    project -- TODO confirm.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/3')
    domaininfo.excluded = 1

    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 1, r
    assert r['saved'] == 0, r

    # The queue directory is only read after shutdown(); presumably this
    # flushes pending queue files to disk -- verify against LevelDispatcher.
    dispatcher.shutdown()

    # Dump the content of every .gz file in the excluded-list queue
    # directory to stdout as a debugging aid.
    for q in py.path.local(dispatcher.excludedlist.qdir).listdir(
            fil=lambda p: p.ext == '.gz'):
        with gzip.open(str(q)) as f:
            # Parenthesized single-argument form: identical output as a
            # Python 2 print statement, and valid as a Python 3 call.
            print(f.read())

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    """Check that an ordinary URI read from the incoming queue is scheduled.

    One URI is queued into the 'wide' incoming queue; ``processinq`` must
    count it as processed and scheduled (not excluded, not saved), and the
    scheduler must end up holding exactly that URI in ``curis``.
    """
    target = dict(u='http://test.example.com/1')

    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    incoming = FileEnqueue(testdatadir.inqdir('wide'))
    incoming.queue([target])
    incoming.close()

    result = dispatcher.processinq(10)

    # Counters are checked in a fixed order; the whole result dict is the
    # failure message for each check.
    expected_counts = (
        ('processed', 1),
        ('scheduled', 1),
        ('excluded', 0),
        ('saved', 0),
    )
    for counter, expected in expected_counts:
        assert result[counter] == expected, result

    assert len(scheduler.curis) == 1
    scheduled = scheduler.curis[0]
    assert scheduled['u'] == target['u']
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URI already registered with the seen-check is dropped.

    The URI is passed to ``dispatcher.seen.already_seen`` before being
    queued, so ``processinq`` must count it as processed only: not
    scheduled, not excluded, not saved, and absent from ``scheduler.curis``.
    """
    seen_uri = dict(u='http://test.example.com/2')

    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    incoming = FileEnqueue(testdatadir.inqdir('wide'))

    # Register the URI with the seen-check before it enters the queue.
    dispatcher.init_seen()
    dispatcher.seen.already_seen(seen_uri)

    incoming.queue([seen_uri])
    incoming.close()

    # Debugging hint: the queued files can be inspected at this point with
    #   subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

    result = dispatcher.processinq(10)

    expected_counts = (
        ('processed', 1),
        ('scheduled', 0),
        ('excluded', 0),
        ('saved', 0),
    )
    for counter, expected in expected_counts:
        assert result[counter] == expected, result

    assert len(scheduler.curis) == 0, scheduler.curis
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URI is saved when ``scheduler._client_active`` is False.

    With ``_client_active`` forced to False, ``processinq`` must count the
    queued URI as processed and saved (not scheduled, not excluded).  After
    ``shutdown()``, the diverter queue named '0' must contain exactly that
    URI.
    """
    target = dict(u='http://test.example.com/')

    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    incoming = FileEnqueue(testdatadir.inqdir('wide'))

    scheduler._client_active = False

    incoming.queue([target])
    incoming.close()

    result = dispatcher.processinq(10)

    expected_counts = (
        ('processed', 1),
        ('scheduled', 0),
        ('excluded', 0),
        ('saved', 1),
    )
    for counter, expected in expected_counts:
        assert result[counter] == expected, result

    # The diverter queue directory is read only after shutdown().
    dispatcher.shutdown()

    saved_queue = dispatcher.diverter.getqueue('0')
    saved_items = readqueue(saved_queue.qdir)
    assert len(saved_items) == 1, saved_items

    first = saved_items[0]
    assert isinstance(first, dict), first
    assert first['u'] == target['u']