def test_420(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = 'http://localhost:%s/420' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as excinfo:
            browser.browse_page(url)
        assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = 'http://localhost:%s/site4/alert.html' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        browser.browse_page(url)
        browser.browse_page(
                'http://localhost:%s/site4/confirm.html' % httpd.server_port)
        browser.browse_page(
                'http://localhost:%s/site4/prompt.html' % httpd.server_port)
def test_on_response(httpd):
    response_urls = []

    def on_response(msg):
        response_urls.append(msg['params']['response']['url'])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = 'http://localhost:%s/site3/page.html' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(url, on_response=on_response)
    assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
    assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
    assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
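# Note on the callback used in test_on_response above: on_response receives
# the raw Chrome DevTools Protocol "Network.responseReceived" message, so a
# test can pull other fields out of msg['params']['response'] besides 'url'.
# A minimal, hypothetical sketch (not part of the original suite) that
# collects HTTP status codes instead:
def _status_collector(statuses):
    # return a closure suitable for passing as browse_page(on_response=...)
    def on_response(msg):
        statuses.append(msg['params']['response']['status'])
    return on_response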
def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
            None, {'url': 'http://localhost:%s/site8/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
    }
def test_try_login(httpd):
    """Test try_login behavior."""
    response_urls = []

    def on_response(msg):
        response_urls.append(msg['params']['response']['url'])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port
    favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port
    login_url = 'http://localhost:%s/login-action' % httpd.server_port

    # When username and password are defined and the initial page has a login
    # form, detect the login form, submit the login, and then return to the
    # initial page.
    username = '******'
    password = '******'
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
                form_url, username=username, password=password,
                on_response=on_response)
    assert len(response_urls) == 4
    assert response_urls[0] == form_url
    assert response_urls[1] == favicon_url
    assert response_urls[2] == login_url
    assert response_urls[3] == form_url

    # When username and password are not defined, just load the initial page.
    response_urls = []
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(form_url, on_response=on_response)
    assert len(response_urls) == 2
    assert response_urls[0] == form_url
    assert response_urls[1] == favicon_url

    # When the page doesn't have a form with username/password, don't submit it.
    response_urls = []
    form_without_login_url = (
            'http://localhost:%s/site11/form-no-login.html' % httpd.server_port)
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
                form_without_login_url, username=username, password=password,
                on_response=on_response)
    assert len(response_urls) == 2
    assert response_urls[0] == form_without_login_url
    assert response_urls[1] == favicon_url
def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
            None, {'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 4
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'youtube-dl',
        'content-length': 92728,
        'content-type': 'video/webm',
        'response_code': 200,
        'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port,
    }
    assert page.videos[2] == {
        'blame': 'youtube-dl',
        'content-length': 101114,
        'content-type': 'video/webm',
        'response_code': 200,
        'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port,
    }
    assert page.videos[3] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when the proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    # bound but never listen()ed on, so connection attempts are refused
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        site = brozzler.Site(None, {'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        chrome_exe = brozzler.suggest_default_chrome_exe()
        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that these things were persisted
    site.refresh()
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)  # in case warcprox hasn't finished processing urls

    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        expected_payload = open(os.path.join(
                os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
                'rb').read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}
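# _test_proxy_setting above is a parameterized helper rather than a test in
# its own right. A minimal sketch of how it might be wrapped by concrete
# tests -- the wrapper names below are illustrative assumptions, not
# necessarily the names used elsewhere in this suite:
def test_no_proxy_sketch(httpd):
    # no proxy at all, so nothing should land in the captures table
    _test_proxy_setting(
            httpd, proxy=None, warcprox_auto=False, is_warcprox=False)

def test_warcprox_auto_sketch(httpd):
    # worker discovers warcprox via the service registry; captures expected
    _test_proxy_setting(httpd, warcprox_auto=True, is_warcprox=True)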
def test_page_interstitial_exception(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = 'http://localhost:%s/401' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.PageInterstitialShown):
            browser.browse_page(url)
def test_aw_snap_hes_dead_jim():
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.BrowsingException):
            browser.browse_page('chrome://crash')