def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper, testscheduler):
    """processinq must schedule each novel URL exactly once even when the
    incoming queue holds duplicate copies of those novel URLs."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    urls = generate_random_urls(100)
    seenurls, novelurls = urls[:50], urls[50:]
    seenfile = create_seen(dispatcher, seenurls)
    # extra copies of some not-yet-seen URLs, appended to the input
    duplicates = [dict(u) for u in novelurls[:25]]
    batch = urls + duplicates
    inq.add(batch)
    inq.close()
    result = dispatcher.processinq(0)
    # every queued record is processed, but duplicates add nothing new
    assert result['processed'] == len(batch), result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == len(novelurls), result
    check_seenfile(seenfile)
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2)."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    # TODO: there's another case of getting terminated during
    # phase 1 - actually it's more likely to happen as it takes
    # longer than phase 2. fortunately phase 1 recovery is simpler
    # than phase 2 recovery - just starting over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()
    seenfile = create_seen(dispatcher, [])
    # let TestScheduler exit on 20th (i.e. after scheduling 19) cURLs.
    testscheduler.failat = 20
    # BUG FIX: the original had `assert False` inside the try block, so a
    # non-raising processinq produced an AssertionError that the broad
    # `except Exception` silently swallowed and the test passed anyway.
    # Track the raise with a flag and assert after the handler instead.
    raised = False
    try:
        dispatcher.processinq(0)
    except Exception:
        # expected: scheduler aborts at the 20th cURL
        raised = True
    assert raised, 'processinq should have raised an exception'
    assert len(testscheduler.curis) == 19
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])
    testscheduler.failat = None
    # enqueue another 50 URLs to verify they are not consumed by
    # next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)
    dispatcher.processinq(0)
    # TODO: want to check all intermediate files are cleaned up?
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])
    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)
    # check: all of urls1 are scheduled, no duplicates
    assert len(testscheduler.curis) == len(urls1)
    # set for O(1) membership tests instead of O(n) list scans
    scheduled_urls = set(u['u'] for u in testscheduler.curis)
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        print >>sys.stderr, "{} {}".format(u['u'], found)
        if not found:
            missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler): inq = IncomingQueue(testdatadir.inqdir('wide')) dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper, testscheduler, inq) urls = generate_random_urls(100) for url in urls: print url['u'] seenurls = urls[:50] novelurls = urls[50:] seenfile = create_seen(dispatcher, seenurls) print "processinq #1" inq.add(urls) inq.close() result = dispatcher.processinq(0) assert result['processed'] == 100, result assert result['excluded'] == 0, result assert result['saved'] == 0, result assert result['scheduled'] == 50, result scheduled = set(url['u'] for url in testscheduler.curis) assert all(url['u'] not in scheduled for url in seenurls) assert all(url['u'] in scheduled for url in novelurls) print "processinq #2" inq.add(urls) inq.close() testscheduler.curis = [] result = dispatcher.processinq(0) assert result['processed'] == 100, result assert result['excluded'] == 0, result assert result['saved'] == 0, result assert result['scheduled'] == 0, result assert len(testscheduler.curis) == 0 check_seenfile(seenfile)