示例#1
0
def test_time_limit(httpd):
    """Crawl a seed that could be crawled forever and verify that its
    'time_limit' setting forces the site, and then its job, to finish.

    Requires a local rethinkdb ('brozzler' db) and a running brozzler
    worker; `httpd` is the test fixture http server.
    """
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {
        'seeds': [{
            'url': make_url(httpd, '/infinite/foo/'),
            'time_limit': 20
        }]
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon; poll for up to 120s
    start = time.time()
    while not site.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while not job.status == 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'
示例#2
0
def test_stop_crawl(httpd):
    """Request a crawl stop for one site, then for the whole job, via the
    brozzler-stop-crawl command line entrypoint, and verify statuses end up
    FINISHED_STOP_REQUESTED (sites) / FINISHED (job).

    Requires a local rethinkdb ('brozzler' db) and a running brozzler
    worker; `httpd` is the test fixture http server.
    """
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly; poll for up to 120s
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly; poll for up to 120s
    start = time.time()
    while not job.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'
示例#3
0
def test_time_limit(httpd):
    """Crawl a seed that could be crawled forever and verify that its
    'time_limit' setting forces the site, and then its job, to finish.

    Requires a local rethinkdb ('brozzler' db) and a running brozzler
    worker; `httpd` is the test fixture http server.
    """
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {'seeds': [{
        'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
        'time_limit': 20}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon; poll for up to 120s
    start = time.time()
    while not site.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while not job.status == 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'
示例#4
0
def test_stop_crawl(httpd):
    """Request a crawl stop for one site, then for the whole job, via the
    brozzler-stop-crawl command line entrypoint, and verify statuses end up
    FINISHED_STOP_REQUESTED (sites) / FINISHED (job).

    Requires a local rethinkdb ('brozzler' db) and a running brozzler
    worker; `httpd` is the test fixture http server.
    """
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly; poll for up to 120s
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly; poll for up to 120s
    start = time.time()
    while not job.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'
示例#5
0
def test_honor_stop_request():
    """honor_stop_request() should raise brozzler.CrawlStopped once a stop
    has been requested on either the site's job or the site itself, and
    should return normally before any stop is requested."""
    rethinker = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rethinker)

    # 1. test stop request on job
    seed_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, seed_conf)
    assert job.id
    job_sites = list(frontier.job_sites(job.id))
    assert len(job_sites) == 1
    the_site = job_sites[0]
    assert the_site.job_id == job.id

    # no stop requested yet, so this does not raise
    frontier.honor_stop_request(the_site)

    # set job.stop_requested
    job.stop_requested = datetime.datetime.utcnow().replace(
            tzinfo=doublethink.UTC)
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(the_site)

    # 2. test stop request on site
    seed_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, seed_conf)
    assert job.id
    job_sites = list(frontier.job_sites(job.id))
    assert len(job_sites) == 1
    the_site = job_sites[0]
    assert the_site.job_id == job.id

    # no stop requested yet, so this does not raise
    frontier.honor_stop_request(the_site)

    # set site.stop_requested
    the_site.stop_requested = doublethink.utcnow()
    the_site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(the_site)
示例#6
0
def test_honor_stop_request():
    """Check both trigger paths of frontier.honor_stop_request(): a stop
    requested at the job level and a stop requested at the site level each
    cause it to raise brozzler.CrawlStopped."""
    db = doublethink.Rethinker('localhost', db='ignoreme')
    fr = brozzler.RethinkDbFrontier(db)

    # 1. test stop request on job
    conf = {'seeds': [{'url': 'http://example.com'}]}
    new_job = brozzler.new_job(fr, conf)
    assert new_job.id
    found = list(fr.job_sites(new_job.id))
    assert len(found) == 1
    site0 = found[0]
    assert site0.job_id == new_job.id

    # does not raise exception while no stop is pending
    fr.honor_stop_request(site0)

    # set job.stop_requested
    new_job.stop_requested = datetime.datetime.utcnow().replace(
            tzinfo=doublethink.UTC)
    new_job.save()
    with pytest.raises(brozzler.CrawlStopped):
        fr.honor_stop_request(site0)

    # 2. test stop request on site
    conf = {'seeds': [{'url': 'http://example.com'}]}
    new_job = brozzler.new_job(fr, conf)
    assert new_job.id
    found = list(fr.job_sites(new_job.id))
    assert len(found) == 1
    site0 = found[0]
    assert site0.job_id == new_job.id

    # does not raise exception while no stop is pending
    fr.honor_stop_request(site0)

    # set site.stop_requested
    site0.stop_requested = doublethink.utcnow()
    site0.save()
    with pytest.raises(brozzler.CrawlStopped):
        fr.honor_stop_request(site0)
示例#7
0
def test_max_claimed_sites():
    """max_claimed_sites is a brozzler job setting that puts a cap on the
    number of the job's sites that can be brozzled simultaneously across the
    cluster; verify that claim_sites() stops handing out sites at the cap
    and raises NothingToClaim after it is reached."""
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    job_conf = {
        'seeds': [
            {'url': 'http://example.com/1'},
            {'url': 'http://example.com/2'},
            {'url': 'http://example.com/3'},
            {'url': 'http://example.com/4'},
            {'url': 'http://example.com/5'},
        ],
        'max_claimed_sites': 3,
    }

    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.max_claimed_sites == 3

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 5

    # first claim takes one site; the second can only take two more before
    # hitting the cap of three simultaneously claimed sites
    claimed_sites = frontier.claim_sites(1)
    assert len(claimed_sites) == 1
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_sites(3)

    # clean slate for the next one
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()
示例#8
0
def test_max_claimed_sites():
    """max_claimed_sites is a brozzler job setting that puts a cap on the
    number of the job's sites that can be brozzled simultaneously across the
    cluster; verify that claim_sites() stops handing out sites at the cap
    and raises NothingToClaim after it is reached."""
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    job_conf = {
        'seeds': [
            {'url': 'http://example.com/1'},
            {'url': 'http://example.com/2'},
            {'url': 'http://example.com/3'},
            {'url': 'http://example.com/4'},
            {'url': 'http://example.com/5'},
        ],
        'max_claimed_sites': 3,
    }

    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.max_claimed_sites == 3

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 5

    # first claim takes one site; the second can only take two more before
    # hitting the cap of three simultaneously claimed sites
    claimed_sites = frontier.claim_sites(1)
    assert len(claimed_sites) == 1
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_sites(3)

    # clean slate for the next one
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()
示例#9
0
def test_basics():
    """Create a two-seed job and verify the exact documents that
    brozzler.new_job() writes to rethinkdb for the job, its sites, and
    their initial seed pages; also exercise the "brozzled" filter of
    frontier.site_pages()."""
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    # the job document should contain exactly these fields
    assert job == {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'}
            ]
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {
                'start': job.starts_and_stops[0]['start'],
                'stop': None
            }
        ]
    }

    # one site per seed; sort by seed url for deterministic ordering
    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    assert sites[0] == {
        'claimed': False,
        'id': sites[0].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'http://(com,example,)/'
        },
        'seed': 'http://example.com',
        'starts_and_stops': [
            {
                'start': sites[0].starts_and_stops[0]['start'],
                'stop': None
           }
        ],
        'status': 'ACTIVE'
    }
    assert sites[1] == {
        'claimed': False,
        'id': sites[1].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'https://(org,example,)/',
        },
        'seed': 'https://example.org/',
        'starts_and_stops': [
            {
                'start': sites[1].starts_and_stops[0]['start'],
                'stop': None,
           },
        ],
        'status': 'ACTIVE',
    }

    # each site starts with a single page: its seed, never brozzled yet
    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[0].id,
        'url': 'http://example.com',
    }
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[1].id,
        'url': 'https://example.org/',
    }

    # test "brozzled" parameter of frontier.site_pages
    # (a page counts as brozzled when its brozzle_count is nonzero)
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
示例#10
0
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.

    Covers plain finish/resume cycles, job-level stop requests, and
    site-level stop requests, checking the starts_and_stops bookkeeping on
    both job and site documents at each step.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 4
    assert job.starts_and_stops[3]['start']
    assert job.starts_and_stops[3]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 4
    assert site.starts_and_stops[3]['start']
    assert site.starts_and_stops[3]['stop'] is None

    # simulate a job stop request
    job_conf = {'seeds': [{'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert job.stop_requested
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert site2.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 1
    assert len(site2.starts_and_stops) == 1
    assert site1.starts_and_stops[0]['start']
    assert site1.starts_and_stops[0]['stop']
    # bugfix: compare each site's stop to its own start (previously compared
    # against the stale `site` variable left over from earlier in the test)
    assert site1.starts_and_stops[0]['stop'] > site1.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['stop']
    assert site2.starts_and_stops[0]['stop'] > site2.starts_and_stops[0]['start']

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate a site stop request
    site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop']
    # bugfix: was comparing against the stale `site` variable
    assert site1.starts_and_stops[1]['stop'] > site1.starts_and_stops[1]['start']
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate site resume after a stop request
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert site1.stop_requested is None
    assert len(site1.starts_and_stops) == 3
    assert site1.starts_and_stops[2]['start']
    assert site1.starts_and_stops[2]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None
示例#11
0
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    # bugfix: each interval's stop must be later than that same interval's
    # start (previously compared against interval [0]'s start, which is a
    # weaker assertion since interval starts are monotonically increasing)
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    # bugfix: compare to interval [2]'s own start, not interval [0]'s
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']
示例#12
0
def test_basics():
    """Create a two-seed job and verify the exact documents that
    brozzler.new_job() writes to rethinkdb for the job, its sites, and
    their initial seed pages; also exercise the "brozzled" filter of
    frontier.site_pages()."""
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    # the job document should contain exactly these fields
    assert job == {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'}
            ]
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {
                'start': job.starts_and_stops[0]['start'],
                'stop': None
            }
        ]
    }

    # one site per seed; sort by seed url for deterministic ordering
    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    assert sites[0] == {
        'claimed': False,
        'id': sites[0].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]},
        'seed': 'http://example.com',
        'starts_and_stops': [
            {
                'start': sites[0].starts_and_stops[0]['start'],
                'stop': None
           }
        ],
        'status': 'ACTIVE'
    }
    assert sites[1] == {
        'claimed': False,
        'id': sites[1].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]},
        'seed': 'https://example.org/',
        'starts_and_stops': [
            {
                'start': sites[1].starts_and_stops[0]['start'],
                'stop': None,
           },
        ],
        'status': 'ACTIVE',
    }

    # each site starts with a single page: its seed, never brozzled yet
    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[0].id,
        'url': 'http://example.com',
    }
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[1].id,
        'url': 'https://example.org/',
    }

    # test "brozzled" parameter of frontier.site_pages
    # (a page counts as brozzled when its brozzle_count is nonzero)
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
示例#13
0
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.

    Walks a job and its site(s) through several finish/resume cycles and
    checks that `status`, `starts_and_stops`, and `stop_requested` are
    maintained correctly on both the job and the sites at each step.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    # new job and site start out ACTIVE with one open start/stop interval
    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    # finishing the only site finishes the job and closes both intervals
    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    # resuming the site reactivates the job and opens a second interval
    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 4
    assert job.starts_and_stops[3]['start']
    assert job.starts_and_stops[3]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 4
    assert site.starts_and_stops[3]['start']
    assert site.starts_and_stops[3]['stop'] is None

    # simulate a job stop request (fresh two-seed job)
    job_conf = {'seeds': [{'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert job.stop_requested
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert site2.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 1
    assert len(site2.starts_and_stops) == 1
    assert site1.starts_and_stops[0]['start']
    assert site1.starts_and_stops[0]['stop']
    # compare each site's own timestamps (was a copy-paste bug comparing
    # against the unrelated earlier `site` variable)
    assert site1.starts_and_stops[0]['stop'] > site1.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['stop']
    assert site2.starts_and_stops[0]['stop'] > site2.starts_and_stops[0]['start']

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate a site stop request; stopping one site must not stop the job
    site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop']
    # was `site.starts_and_stops[1]['start']` — wrong object (copy-paste bug)
    assert site1.starts_and_stops[1]['stop'] > site1.starts_and_stops[1]['start']
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate site resume after a stop request; only site1 gets a new interval
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert site1.stop_requested is None
    assert len(site1.starts_and_stops) == 3
    assert site1.starts_and_stops[2]['start']
    assert site1.starts_and_stops[2]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None