def test_provision_with_schema(segment_manager_server):
    schema = '''CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));
INSERT INTO test (test) VALUES ("test");'''

    # create a schema by submitting sql
    result = segment_manager_server.put(
        '/schema/test1/sql', content_type='application/sql', data=schema)
    assert result.status_code == 201

    # provision a segment with that schema
    result = segment_manager_server.post(
        '/provision', content_type='application/json',
        data=ujson.dumps({
            'segment': 'test_provision_with_schema_1',
            'schema': 'test1'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'

    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)  # ujson accepts bytes! 😻
    assert result_dict['write_url'].endswith(
        ':6222/?segment=test_provision_with_schema_1')

    # get db read url from rethinkdb
    rethinker = doublethink.Rethinker(
        servers=settings['RETHINKDB_HOSTS'], db='trough_configuration')
    query = rethinker.table('services').get_all(
        'test_provision_with_schema_1', index='segment').filter({
            'role': 'trough-read'}).filter(
                lambda svc: r.now().sub(
                    svc['last_heartbeat']).lt(svc['ttl'])).order_by('load')[0]
    healthy_segment = query.run()
    read_url = healthy_segment.get('url')
    assert read_url.endswith(':6444/?segment=test_provision_with_schema_1')

    # run a query to check that the schema was used
    sql = 'SELECT * FROM test;'
    with requests.post(read_url, stream=True, data=sql) as response:
        assert response.status_code == 200
        result = ujson.loads(response.text)
        assert result == [{'test': 'test', 'id': 1}]

    # delete the schema from rethinkdb for the sake of other tests
    rethinker = doublethink.Rethinker(
        servers=settings['RETHINKDB_HOSTS'], db='trough_configuration')
    result = rethinker.table('schema').get('test1').delete().run()
    assert result == {
        'deleted': 1, 'inserted': 0, 'skipped': 0, 'errors': 0,
        'unchanged': 0, 'replaced': 0}

def test_proxy_for_write_segment(self, requests):
    def post(*args, **kwargs):
        response = mock.Mock()
        response.headers = {"Content-Type": "application/json"}
        response.iter_content = lambda: (b"test", b"output")
        response.status_code = 200
        response.__enter__ = lambda *args, **kwargs: response
        response.__exit__ = lambda *args, **kwargs: None
        return response
    requests.post = post

    consul = mock.Mock()
    registry = mock.Mock()
    rethinker = doublethink.Rethinker(
        db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
    services = doublethink.ServiceRegistry(rethinker)
    segment = trough.sync.Segment(
        segment_id="TEST", rethinker=rethinker, services=services,
        registry=registry, size=0)
    output = self.server.proxy_for_write_host(
        'localhost', segment, "SELECT * FROM mock;",
        start_response=lambda *args, **kwargs: None)
    self.assertEqual(list(output), [b"test", b"output"])

def test_stop_crawl(httpd):
    test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not job.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'

def _do_read(self, query, raw=False):
    # send query to a healthy read node for this segment, return JSON
    rethinker = doublethink.Rethinker(
        db="trough_configuration", servers=self.rethinkdb)
    healthy_databases = list(rethinker.table('services').get_all(
        self.database, index='segment').run())
    healthy_databases = [
        db for db in healthy_databases
        if db['role'] == 'trough-read'
        and (rethinker.now().run() - db['last_heartbeat']).seconds < db['ttl']]
    if not healthy_databases:
        raise Exception(
            'No healthy node found for segment %s' % self.database)
    url = urlparse(healthy_databases[0].get('url'))
    if self.proxy:
        conn = HTTPConnection(self.proxy, self.proxy_port)
        conn.set_tunnel(url.netloc, url.port)
        conn.sock = socks.socksocket()
        conn.sock.set_proxy(self.proxy_type, self.proxy, self.proxy_port)
        conn.sock.connect((url.netloc.split(":")[0], url.port))
    else:
        conn = HTTPConnection(url.netloc)
    request_path = "%s?%s" % (url.path, url.query)
    conn.request("POST", request_path, query)
    response = conn.getresponse()
    results = json.loads(response.read())
    self._last_results = results

def __init__(self, options=warcprox.Options()):
    parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
    self.rr = doublethink.Rethinker(
        servers=parsed.hosts, db=parsed.database)
    self.table = parsed.table
    self._ensure_db_table()
    self.options = options

def __init__(self):
    self.rethinker = doublethink.Rethinker(
        db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    self.registry = trough.sync.HostRegistry(
        rethinker=self.rethinker, services=self.services)
    trough.sync.init(self.rethinker)

def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    assert site.scope['surt'] == 'http://(org,example,)/'
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)
    assert site.scope['surt'] == 'http://(org,example,)/'
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]

def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
    '''
    TroughClient constructor

    Args:
        rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
            trough configuration database
        promotion_interval: if specified, `TroughClient` will spawn a
            thread that "promotes" (pushes to hdfs) "dirty" trough segments
            (segments that have received writes) periodically, sleeping for
            `promotion_interval` seconds between cycles (default None)
    '''
    parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
    self.rr = doublethink.Rethinker(
        servers=parsed.hosts, db=parsed.database)
    self.svcreg = doublethink.ServiceRegistry(self.rr)
    self._write_url_cache = {}
    self._read_url_cache = {}
    self._dirty_segments = set()
    self._dirty_segments_lock = threading.RLock()

    self.promotion_interval = promotion_interval
    self._promoter_thread = None
    if promotion_interval:
        self._promoter_thread = threading.Thread(
            target=self._promotrix, name='TroughClient-promoter')
        self._promoter_thread.setDaemon(True)
        self._promoter_thread.start()

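# A minimal usage sketch for the constructor above. The rethinkdb:// URL and
# the 60-second interval are illustrative values, not taken from the source.
# With promotion_interval set, the client spawns its background promoter
# thread; when it is left as None, no promoter thread is started.
client = TroughClient(
    'rethinkdb://db0.example.com,db1.example.com/trough_configuration',
    promotion_interval=60)
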
def test_seed_redirect(httpd):
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port

def test_time_limit(httpd):
    test_id = 'test_time_limit-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {'seeds': [{
        'url': make_url(httpd, '/infinite/foo/'),
        'time_limit': 20}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon
    start = time.time()
    while not site.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while job.status != 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh', '#ignored', '#whee']

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url

def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1})
    page1.save()
    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0})
    page0.save()
    assert frontier.seed_page(site.id) == page0

def test_obey_robots(httpd):
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': make_url(httpd, '/site1/'),
        'user_agent': 'im a badbot',  # robots.txt blocks badbot
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})

    # stop the worker so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == site.seed
        assert site_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
    assert requests.get(
        wb_url, allow_redirects=False).content == expected_payload

def service_registry(options):
    if options.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(
            servers=parsed.hosts, db=parsed.database)
        return doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        return None

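# For reference, a sketch of the rethinkdb:// URL shape consumed by
# doublethink.parse_rethinkdb_url() throughout this code (the hostnames here
# are made up for illustration; the table component is optional, as the
# --rethinkdb-trough-db-url tests below demonstrate):
#
#     rethinkdb://host1[,host2...]/database[/table]
#
parsed = doublethink.parse_rethinkdb_url(
    'rethinkdb://db0.example.com,db1.example.com/warcprox/services')
rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
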
def setUp(self):
    self.rethinker = doublethink.Rethinker(
        db=random_db, servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    self.registry = sync.HostRegistry(
        rethinker=self.rethinker, services=self.services)
    self.snakebite_client = mock.Mock()
    self.rethinker.table("services").delete().run()

def setUp(self):
    self.rethinker = doublethink.Rethinker(
        db=random_db, servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    sync.init(self.rethinker)
    self.rethinker.table("services").delete().run()
    self.rethinker.table("lock").delete().run()
    self.rethinker.table("assignment").delete().run()

def __init__(self, options=warcprox.Options()):
    StatsProcessor.__init__(self, options)
    parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
    self.rr = doublethink.Rethinker(
        servers=parsed.hosts, db=parsed.database)
    self.table = parsed.table
    self.replicas = min(3, len(self.rr.servers))

def test_login(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}},
        'username': '******', 'password': '******'})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rr.table('captures').filter(
        {'test_id': test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/'
            % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/'
            % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html'
            % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'
            % httpd.server_port) in meth_url

def _init_brozzler_worker(self, args):
    rr = doublethink.Rethinker(
        args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry,
        chrome_exe=args.chrome_exe,
        proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
        max_browsers=args.max_browsers)
    return worker

def test_time_limit():
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/', 'time_limit': 99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)
    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)

def test_time_limit():
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/', 'time_limit': 99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    time.sleep(0.1)
    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

def test_redirect_hashtags(httpd):
    test_id = 'test_redirect_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#hash1', '#hash2']

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET']
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags

def test_max_claimed_sites():
    # max_claimed_sites is a brozzler job setting that puts a cap on the
    # number of the job's sites that can be brozzled simultaneously across
    # the cluster
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    job_conf = {
        'seeds': [
            {'url': 'http://example.com/1'},
            {'url': 'http://example.com/2'},
            {'url': 'http://example.com/3'},
            {'url': 'http://example.com/4'},
            {'url': 'http://example.com/5'},
        ],
        'max_claimed_sites': 3,
    }

    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.max_claimed_sites == 3

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 5

    claimed_sites = frontier.claim_sites(1)
    assert len(claimed_sites) == 1
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites(3)

    # clean slate for the next one
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

def __init__(self, options=warcprox.Options()):
    parsed = doublethink.parse_rethinkdb_url(
        options.rethinkdb_big_table_url)
    self.rr = doublethink.Rethinker(
        servers=parsed.hosts, db=parsed.database)
    self.table = parsed.table
    self.options = options
    self._ensure_db_table()
    self._stop = threading.Event()
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = []
    self._timer = None

def test_ydl_stitching(httpd):
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': make_url(httpd, '/site10/'),
        'warcprox_meta': {
            'warc-prefix': 'test_ydl_stitching',
            'captures-table-extra-fields': {'test_id': test_id}}})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    matches = [c for c in captures if c['url'] == stitched_url]
    assert len(matches) == 1
    capture = matches[0]
    assert capture['filename'].startswith('test_ydl_stitching')
    assert capture['content_type'] == 'video/mp4'
    assert capture['http_method'] == 'WARCPROX_WRITE_RECORD'

def test_combos():
    rr = doublethink.Rethinker(['127.0.0.1'])
    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db00/stats',
            '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db01',
        ])
        assert rr.db('db00').table_list().run() == ['stats']
        assert rr.db('db01').table_list().run() == ['services']
        # ['assignment', 'lock', 'schema', 'services']
    finally:
        rr.db_drop('db00').run()
        rr.db_drop('db01').run()

def test_individual_options():
    rr = doublethink.Rethinker(['127.0.0.1'])

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db0/stats'])
        assert rr.db('db0').table_list().run() == ['stats']
    finally:
        rr.db_drop('db0').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-services-url=rethinkdb://127.0.0.1/db1/services'])
        assert rr.db('db1').table_list().run() == ['services']
    finally:
        rr.db_drop('db1').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-dedup-url=rethinkdb://127.0.0.1/db2/dedup'])
        assert rr.db('db2').table_list().run() == ['dedup']
    finally:
        rr.db_drop('db2').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-big-table-url=rethinkdb://127.0.0.1/db3/captures'])
        assert rr.db('db3').table_list().run() == ['captures']
    finally:
        rr.db_drop('db3').run()

    try:
        warcprox.main.ensure_rethinkdb_tables([
            'warcprox-ensure-rethinkdb-tables',
            '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db4'])
        assert rr.db('db4').table_list().run() == ['services']
        # ['assignment', 'lock', 'schema', 'services']
    finally:
        rr.db_drop('db4').run()

def _do_write(self, query):
    # if we don't have a write url yet, send a provision request to the
    # trough-sync-master and save the write url it returns; then send the
    # query itself to the write url
    rethinker = doublethink.Rethinker(
        db="trough_configuration", servers=self.rethinkdb)
    services = doublethink.ServiceRegistry(rethinker)
    master_node = services.unique_service('trough-sync-master')
    logging.info('master_node=%r', master_node)
    if not master_node:
        raise Exception('no healthy trough-sync-master in service registry')
    if not self._write_url:
        buffer = BytesIO()
        c = pycurl.Curl()
        c.setopt(c.URL, master_node.get('url'))
        c.setopt(c.POSTFIELDS, self.database)
        if self.proxy:
            c.setopt(pycurl.PROXY, self.proxy)
            c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
            c.setopt(pycurl.PROXYTYPE, self.proxy_type)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
        c.close()
        self._write_url = buffer.getvalue()
        logging.info('self._write_url=%r', self._write_url)
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, self._write_url)
    c.setopt(c.POSTFIELDS, query)
    if self.proxy:
        c.setopt(pycurl.PROXY, self.proxy)
        c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
        c.setopt(pycurl.PROXYTYPE, self.proxy_type)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()
    response = buffer.getvalue()
    if response.strip() != b'OK':
        raise Exception(
            'Trough Query Failed: Database: %r Response: %r Query: %.200r'
            % (self.database, response, query))
    self._last_results = None

def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()

def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]

    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
        'https://example.com/', 'https://example.com/foo']
    assert sorted(parent_page.outlinks['accepted']) == [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar']
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id) is None
    for url in parent_page.outlinks['accepted']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id)