Exemplo n.º 1
0
def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper,
                              testscheduler):
    """Duplicate unseen URLs within one input batch are scheduled once.

    Feeds 100 random URLs (first 50 pre-registered as seen) plus copies
    of 25 of the novel URLs. All items must be counted as processed,
    but each novel URL must be scheduled exactly once despite the
    in-batch duplicates.
    """
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)

    urls = generate_random_urls(100)
    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    # duplicates of *novel* URLs (not seen ones); dict() copies so the
    # dispatcher receives distinct objects with identical URL values.
    dupnovelurls = [dict(url) for url in novelurls[:25]]

    # named 'inputs' to avoid shadowing the builtin input()
    inputs = urls + dupnovelurls
    inq.add(inputs)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == len(inputs), result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    # each novel URL scheduled once; duplicates must not be re-scheduled
    assert result['scheduled'] == len(novelurls), result

    check_seenfile(seenfile)
Exemplo n.º 2
0
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2)."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)
    # TODO: there's another case of getting terminated during
    # phase 1 - actually it's more likely to happen as it takes
    # longer than phase 2. fortunately phase 1 recovery is simpler
    # than phase 2 recovery - just starting over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()

    seenfile = create_seen(dispatcher, [])

    # let TestScheduler exit on 20th (i.e. after scheduling 19) cURLs.
    testscheduler.failat = 20
    # BUGFIX: the previous `assert False` inside the try block raised
    # AssertionError, which the broad `except Exception` silently
    # swallowed - the test passed even when processinq did NOT raise.
    # Use a flag and assert outside the try instead.
    raised = False
    try:
        dispatcher.processinq(0)
    except Exception:
        # expected: simulated failure injected via testscheduler.failat
        raised = True
    assert raised, 'processinq should have raised during phase 2'

    assert len(testscheduler.curis) == 19

    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    testscheduler.failat = None
    # enqueue another 50 URLs to verify they are not consumed by
    # next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)

    dispatcher.processinq(0)

    # TODO: want to check all intermediate files are cleaned up?
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)
    # check: all of urls1 are scheduled, no duplicates
    assert len(testscheduler.curis) == len(urls1)
    # a set gives O(1) membership tests in the loop below (was an O(n)
    # list scan per URL)
    scheduled_urls = set(u['u'] for u in testscheduler.curis)
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        print >>sys.stderr, "{} {}".format(u['u'], found)
        if not found: missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
Exemplo n.º 3
0
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler):
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)

    urls = generate_random_urls(100)
    for url in urls:
        print url['u']

    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    print "processinq #1"

    inq.add(urls)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 50, result

    scheduled = set(url['u'] for url in testscheduler.curis)
    assert all(url['u'] not in scheduled for url in seenurls)
    assert all(url['u'] in scheduled for url in novelurls)

    print "processinq #2"

    inq.add(urls)
    inq.close()

    testscheduler.curis = []
    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 0, result

    assert len(testscheduler.curis) == 0

    check_seenfile(seenfile)