Example #1
def test_provision_with_schema(segment_manager_server):
    schema = '''CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));
INSERT INTO test (test) VALUES ("test");'''
    # create a schema by submitting sql
    result = segment_manager_server.put('/schema/test1/sql',
                                        content_type='application/sql',
                                        data=schema)
    assert result.status_code == 201

    # provision a segment with that schema
    result = segment_manager_server.post('/provision',
                                         content_type='application/json',
                                         data=ujson.dumps({
                                             'segment':
                                             'test_provision_with_schema_1',
                                             'schema': 'test1'
                                         }))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)  # ujson accepts bytes! 😻
    assert result_dict['write_url'].endswith(
        ':6222/?segment=test_provision_with_schema_1')

    # get db read url from rethinkdb
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    query = rethinker.table('services').get_all(
        'test_provision_with_schema_1', index='segment'
    ).filter({'role': 'trough-read'}).filter(
        lambda svc: r.now().sub(svc['last_heartbeat']).lt(svc['ttl'])
    ).order_by('load')[0]
    healthy_segment = query.run()
    read_url = healthy_segment.get('url')
    assert read_url.endswith(':6444/?segment=test_provision_with_schema_1')

    # run a query to check that the schema was used
    sql = 'SELECT * FROM test;'
    with requests.post(read_url, stream=True, data=sql) as response:
        assert response.status_code == 200
        result = ujson.loads(response.text)
        assert result == [{'test': 'test', 'id': 1}]

    # delete the schema from rethinkdb for the sake of other tests
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    result = rethinker.table('schema').get('test1').delete().run()
    assert result == {
        'deleted': 1,
        'inserted': 0,
        'skipped': 0,
        'errors': 0,
        'unchanged': 0,
        'replaced': 0
    }
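For comparison, the same two HTTP calls sketched with requests against a hypothetical trough sync master; the base url below is a placeholder (the test above talks to a Flask test client instead), and the endpoints and payloads mirror the test:

import requests
import ujson

SYNC_MASTER = 'http://localhost:6111'  # placeholder address, not from the test above

# register a schema by submitting raw sql
requests.put('%s/schema/test1/sql' % SYNC_MASTER,
             headers={'Content-Type': 'application/sql'},
             data='CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));')

# provision a segment that uses the schema; the response is json containing a write_url
result = requests.post('%s/provision' % SYNC_MASTER,
                       headers={'Content-Type': 'application/json'},
                       data=ujson.dumps({'segment': 'my_segment', 'schema': 'test1'}))
write_url = ujson.loads(result.text)['write_url']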
Example #2
    def test_proxy_for_write_segment(self, requests):
        def post(*args, **kwargs):
            response = mock.Mock()
            response.headers = {"Content-Type": "application/json"}
            response.iter_content = lambda: (b"test", b"output")
            response.status_code = 200
            response.__enter__ = lambda *args, **kwargs: response
            response.__exit__ = lambda *args, **kwargs: None
            return response

        requests.post = post
        consul = mock.Mock()
        registry = mock.Mock()
        rethinker = doublethink.Rethinker(db="trough_configuration",
                                          servers=settings['RETHINKDB_HOSTS'])
        services = doublethink.ServiceRegistry(rethinker)
        segment = trough.sync.Segment(segment_id="TEST",
                                      rethinker=rethinker,
                                      services=services,
                                      registry=registry,
                                      size=0)
        output = self.server.proxy_for_write_host(
            'localhost',
            segment,
            "SELECT * FROM mock;",
            start_response=lambda *args, **kwargs: None)
        self.assertEqual(list(output), [b"test", b"output"])
Example #3
def test_stop_crawl(httpd):
    test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not job.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'
Example #4
 def _do_read(self, query, raw=False):
     # send query to server, return JSON
     rethinker = doublethink.Rethinker(db="trough_configuration",
                                       servers=self.rethinkdb)
     healthy_databases = list(
         rethinker.table('services').get_all(self.database,
                                             index='segment').run())
     healthy_databases = [
         db for db in healthy_databases if db['role'] == 'trough-read' and
         (rethinker.now().run() - db['last_heartbeat']).seconds < db['ttl']
     ]
     if not healthy_databases:
         raise Exception('No healthy node found for segment %s' %
                         self.database)
     url = urlparse(healthy_databases[0].get('url'))
     if self.proxy:
         conn = HTTPConnection(self.proxy, self.proxy_port)
         conn.set_tunnel(url.netloc, url.port)
         conn.sock = socks.socksocket()
         conn.sock.set_proxy(self.proxy_type, self.proxy, self.proxy_port)
         conn.sock.connect((url.netloc.split(":")[0], url.port))
     else:
         conn = HTTPConnection(url.netloc)
     request_path = "%s?%s" % (url.path, url.query)
     conn.request("POST", request_path, query)
     response = conn.getresponse()
     results = json.loads(response.read())
     self._last_results = results
Example #5
 def __init__(self, options=warcprox.Options()):
     parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
     self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                     db=parsed.database)
     self.table = parsed.table
     self._ensure_db_table()
     self.options = options
Example #6
 def __init__(self):
     self.rethinker = doublethink.Rethinker(
         db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
     self.services = doublethink.ServiceRegistry(self.rethinker)
     self.registry = trough.sync.HostRegistry(rethinker=self.rethinker,
                                              services=self.services)
     trough.sync.init(self.rethinker)
Example #7
def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]
Example #8
    def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
        '''
        TroughClient constructor

        Args:
            rethinkdb_trough_db_url: url with scheme rethinkdb:// pointing to
                trough configuration database
            promotion_interval: if specified, `TroughClient` will spawn a
                thread that "promotes" (pushes to hdfs) "dirty" trough segments
                (segments that have received writes) periodically, sleeping for
                `promotion_interval` seconds between cycles (default None)
        '''
        parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
        self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                        db=parsed.database)
        self.svcreg = doublethink.ServiceRegistry(self.rr)
        self._write_url_cache = {}
        self._read_url_cache = {}
        self._dirty_segments = set()
        self._dirty_segments_lock = threading.RLock()

        self.promotion_interval = promotion_interval
        self._promoter_thread = None
        if promotion_interval:
            self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter')
            self._promoter_thread.setDaemon(True)
            self._promoter_thread.start()
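Given the docstring above, a minimal usage sketch; the rethinkdb:// url below is a placeholder and promotion_interval=300 is only an illustrative value:

client = TroughClient('rethinkdb://localhost/trough_configuration',
                      promotion_interval=300)
# with promotion_interval set, a daemon thread periodically promotes
# dirty (recently written) segments to hdfs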
Example #9
def test_seed_redirect(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
Example #10
def test_time_limit(httpd):
    test_id = 'test_time_limit-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {
        'seeds': [{
            'url': make_url(httpd, '/infinite/foo/'),
            'time_limit': 20
        }]
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while not job.status == 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'
Example #11
def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port
    ]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#boosh',
        '#ignored',
        '#whee',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c
        for c in captures if c['http_method'] != 'HEAD'
    }
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
Example #12
def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1
    })
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0
    })
    page0.save()

    assert frontier.seed_page(site.id) == page0
Example #13
def test_obey_robots(httpd):
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr,
        {
            'seed': make_url(httpd, '/site1/'),
            'user_agent': 'im a badbot',  # robots.txt blocks badbot
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == site.seed
        assert site_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), 'htdocs', 'robots.txt'),
        'rb').read()
    assert requests.get(wb_url,
                        allow_redirects=False).content == expected_payload
Example #14
 def service_registry(options):
     if options.rethinkdb_services_url:
         parsed = doublethink.parse_rethinkdb_url(
                 options.rethinkdb_services_url)
         rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
         return doublethink.ServiceRegistry(rr, table=parsed.table)
     else:
         return None
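For illustration, a sketch of how a rethinkdb:// url breaks down into the pieces these snippets use; the hostnames, database, and table below are placeholders, while the hosts/database/table attributes and the ServiceRegistry call mirror the code above:

import doublethink

parsed = doublethink.parse_rethinkdb_url(
    'rethinkdb://db0.example.com,db1.example.com/my_db/my_table')
rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)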
Example #15
 def setUp(self):
     self.rethinker = doublethink.Rethinker(
         db=random_db, servers=settings['RETHINKDB_HOSTS'])
     self.services = doublethink.ServiceRegistry(self.rethinker)
     self.registry = sync.HostRegistry(rethinker=self.rethinker,
                                       services=self.services)
     self.snakebite_client = mock.Mock()
     self.rethinker.table("services").delete().run()
Example #16
 def setUp(self):
     self.rethinker = doublethink.Rethinker(
         db=random_db, servers=settings['RETHINKDB_HOSTS'])
     self.services = doublethink.ServiceRegistry(self.rethinker)
     sync.init(self.rethinker)
     self.rethinker.table("services").delete().run()
     self.rethinker.table("lock").delete().run()
     self.rethinker.table("assignment").delete().run()
Example #17
    def __init__(self, options=warcprox.Options()):
        StatsProcessor.__init__(self, options)

        parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
        self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                        db=parsed.database)
        self.table = parsed.table
        self.replicas = min(3, len(self.rr.servers))
Example #18
def test_login(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr, {
            'seed': 'http://localhost:%s/site2/' % httpd.server_port,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            },
            'username': '******',
            'password': '******'
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(
        rr.table('captures').filter({
            'test_id': test_id
        }).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' %
            httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' %
            httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' %
            httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' %
            httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' %
            httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
Example #19
 def _init_brozzler_worker(self, args):
     rr = doublethink.Rethinker(
             args.rethinkdb_servers.split(","), args.rethinkdb_db)
     frontier = brozzler.RethinkDbFrontier(rr)
     service_registry = doublethink.ServiceRegistry(rr)
     worker = brozzler.worker.BrozzlerWorker(
             frontier, service_registry, chrome_exe=args.chrome_exe,
             proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
             max_browsers=args.max_browsers)
     return worker
Example #20
def test_time_limit():
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
Example #21
def test_time_limit():
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)

    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    time.sleep(0.1)
    frontier._enforce_time_limit(site)

    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
Example #22
def test_redirect_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')
    ]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#hash1',
        '#hash2',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET'
    ]
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
Example #23
def test_max_claimed_sites():
    # max_claimed_sites is a brozzler job setting that puts a cap on the number
    # of the job's sites that can be brozzled simultaneously across the cluster
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    job_conf = {
        'seeds': [
            {
                'url': 'http://example.com/1'
            },
            {
                'url': 'http://example.com/2'
            },
            {
                'url': 'http://example.com/3'
            },
            {
                'url': 'http://example.com/4'
            },
            {
                'url': 'http://example.com/5'
            },
        ],
        'max_claimed_sites': 3,
    }

    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.max_claimed_sites == 3

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 5

    claimed_sites = frontier.claim_sites(1)
    assert len(claimed_sites) == 1
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites(3)

    # clean slate for the next one
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()
Example #24
    def __init__(self, options=warcprox.Options()):
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_big_table_url)
        self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                        db=parsed.database)
        self.table = parsed.table
        self.options = options
        self._ensure_db_table()

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        with self._batch_lock:
            self._batch = []
        self._timer = None
Example #25
def test_ydl_stitching(httpd):
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site10/'),
            'warcprox_meta': {
                'warc-prefix': 'test_ydl_stitching',
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    l = [c for c in captures if c['url'] == stitched_url]
    assert len(l) == 1
    c = l[0]
    assert c['filename'].startswith('test_ydl_stitching')
    assert c['content_type'] == 'video/mp4'
    assert c['http_method'] == 'WARCPROX_WRITE_RECORD'
Example #26
def test_combos():
    rr = doublethink.Rethinker(['127.0.0.1'])

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db00/stats',
            '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db01',
        ])
        assert rr.db('db00').table_list().run() == ['stats']
        assert rr.db('db01').table_list().run() == ['services']
        # ['assignment', 'lock', 'schema', 'services']
    finally:
        rr.db_drop('db00').run()
        rr.db_drop('db01').run()
Example #27
def test_individual_options():
    rr = doublethink.Rethinker(['127.0.0.1'])

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db0/stats'
        ])
        assert rr.db('db0').table_list().run() == ['stats']
    finally:
        rr.db_drop('db0').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-services-url=rethinkdb://127.0.0.1/db1/services'
        ])
        assert rr.db('db1').table_list().run() == ['services']
    finally:
        rr.db_drop('db1').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-dedup-url=rethinkdb://127.0.0.1/db2/dedup'
        ])
        assert rr.db('db2').table_list().run() == ['dedup']
    finally:
        rr.db_drop('db2').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-big-table-url=rethinkdb://127.0.0.1/db3/captures'
        ])
        assert rr.db('db3').table_list().run() == ['captures']
    finally:
        rr.db_drop('db3').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db4'
        ])
        assert rr.db('db4').table_list().run() == ['services']
        # ['assignment', 'lock', 'schema', 'services']
    finally:
        rr.db_drop('db4').run()
Example #28
 def _do_write(self, query):
     # if self._write_url is not set yet, ask the sync master to provision
     # the segment and cache the returned write url, then post the query to
     # that url
     rethinker = doublethink.Rethinker(db="trough_configuration",
                                       servers=self.rethinkdb)
     services = doublethink.ServiceRegistry(rethinker)
     master_node = services.unique_service('trough-sync-master')
     logging.info('master_node=%r', master_node)
     if not master_node:
         raise Exception(
             'no healthy trough-sync-master in service registry')
     if not self._write_url:
         buffer = BytesIO()
         c = pycurl.Curl()
         c.setopt(c.URL, master_node.get('url'))
         c.setopt(c.POSTFIELDS, self.database)
         if self.proxy:
             c.setopt(pycurl.PROXY, self.proxy)
             c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
             c.setopt(pycurl.PROXYTYPE, self.proxy_type)
         c.setopt(c.WRITEDATA, buffer)
         c.perform()
         c.close()
         self._write_url = buffer.getvalue()
         logging.info('self._write_url=%r', self._write_url)
     buffer = BytesIO()
     c = pycurl.Curl()
     c.setopt(c.URL, self._write_url)
     c.setopt(c.POSTFIELDS, query)
     if self.proxy:
         c.setopt(pycurl.PROXY, self.proxy)
         c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
         c.setopt(pycurl.PROXYTYPE, self.proxy_type)
     c.setopt(c.WRITEDATA, buffer)
     c.perform()
     c.close()
     response = buffer.getvalue()
     if response.strip() != b'OK':
         raise Exception(
             'Trough Query Failed: Database: %r Response: %r Query: %.200r'
             % (self.database, response, query))
     self._last_results = None
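A sketch of the same provision-then-write flow using requests instead of pycurl, assuming only the conventions visible in _do_write above (the sync master's response body is the write url and the write endpoint answers 'OK'); master_url, segment_id, and query are placeholders:

import requests

def provision_and_write(master_url, segment_id, query):
    # ask the sync master to provision (or look up) a write url for the segment
    write_url = requests.post(master_url, data=segment_id).text.strip()
    # post the sql statement to the write url; trough answers 'OK' on success
    response = requests.post(write_url, data=query)
    if response.text.strip() != 'OK':
        raise Exception('trough write failed: %r' % response.text)
    return write_url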
Example #29
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert (claimed_site.last_claimed
            >= doublethink.utcnow() - datetime.timedelta(minutes=1))
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
Example #30
def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1,
        'url': 'http://example.com/whatever'
    })
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
        'https://example.com/', 'https://example.com/foo'
    ]
    assert sorted(parent_page.outlinks['accepted']) == [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar'
    ]
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id) is None
    for url in parent_page.outlinks['accepted']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id)