Exemplo n.º 1
0
def test_420(httpd):
    """Browsing a URL that answers HTTP 420 raises ReachedLimit carrying
    the warcprox metadata from the response."""
    target = 'http://localhost:%s/420' % httpd.server_port
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as exc:
            browser.browse_page(target)
        # the exception must carry the expected warcprox_meta payload
        assert exc.value.warcprox_meta == WARCPROX_META_420
Exemplo n.º 2
0
def test_420(httpd):
    """Browsing an HTTP 420 page raises ReachedLimit with the warcprox
    metadata attached."""
    chrome = brozzler.suggest_default_chrome_exe()
    page_url = 'http://localhost:%s/420' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome) as browser, \
            pytest.raises(brozzler.ReachedLimit) as caught:
        browser.browse_page(page_url)
    # verify the warcprox metadata propagated onto the exception
    assert caught.value.warcprox_meta == WARCPROX_META_420
Exemplo n.º 3
0
def test_page_videos(httpd):
    """Brozzling site6 records the two expected videos on the page object."""
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=exe) as browser:
        worker.brozzle_page(browser, site, page)
    # one capture blamed on youtube-dl, one on the browser itself
    expected = [
        {
            'blame': 'youtube-dl',
            'response_code': 200,
            'content-length': 383631,
            'content-type': 'video/mp4',
            'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
        },
        {
            'blame': 'browser',
            # 'response_code': 206,
            # 'content-range': 'bytes 0-229454/229455',
            'response_code': 200,
            'content-length': 229455,
            'content-type': 'video/webm',
            'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
        },
    ]
    assert page.videos
    assert page.videos == expected
Exemplo n.º 4
0
def test_js_dialogs(httpd):
    """Pages that pop javascript alert/confirm/prompt dialogs must not
    hang browsing.

    Fixes: the original assigned `url` but never used it, and repeated the
    same URL construction three times; loop over the page names instead.
    """
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        for page in ('alert.html', 'confirm.html', 'prompt.html'):
            browser.browse_page(
                    'http://localhost:%s/site4/%s'
                    % (httpd.server_port, page))
Exemplo n.º 5
0
def test_js_dialogs(httpd):
    """Browsing pages with javascript dialogs (alert/confirm/prompt) must
    complete without hanging.

    Fixes: drops the dead local `url` (assigned but never used) and folds
    the three near-identical browse_page calls into a loop.
    """
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        for name in ('alert', 'confirm', 'prompt'):
            browser.browse_page(
                    'http://localhost:%s/site4/%s.html'
                    % (httpd.server_port, name))
Exemplo n.º 6
0
def test_on_response(httpd):
    """The on_response callback fires once per http response, in order:
    page, then its embedded svg, then the favicon."""
    seen = []

    def record(msg):
        seen.append(msg['params']['response']['url'])

    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        browser.browse_page(
                'http://localhost:%s/site3/page.html' % httpd.server_port,
                on_response=record)
    expected = [
        'http://localhost:%s/site3/page.html' % httpd.server_port,
        'http://localhost:%s/site3/brozzler.svg' % httpd.server_port,
        'http://localhost:%s/favicon.ico' % httpd.server_port,
    ]
    assert seen[:3] == expected
Exemplo n.º 7
0
def test_on_response(httpd):
    """on_response is invoked for every http response during a page visit,
    in the order the responses arrive."""
    response_urls = []

    def on_response(msg):
        response_urls.append(msg['params']['response']['url'])

    port = httpd.server_port
    chrome = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome) as browser:
        browser.browse_page(
                'http://localhost:%s/site3/page.html' % port,
                on_response=on_response)
    # page first, then its embedded resource, then the favicon probe
    for i, expected in enumerate([
            'http://localhost:%s/site3/page.html' % port,
            'http://localhost:%s/site3/brozzler.svg' % port,
            'http://localhost:%s/favicon.ico' % port]):
        assert response_urls[i] == expected
Exemplo n.º 8
0
def test_extract_outlinks(httpd):
    """brozzle_page returns the full set of outlinks found on the page,
    including offsite links and fragment-only variants."""
    port = httpd.server_port
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site8/' % port})
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    expected = {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % port,
    }
    assert outlinks == expected
Exemplo n.º 9
0
def test_extract_outlinks(httpd):
    """Outlink extraction picks up offsite links, relative links, and
    distinct fragment identifiers on the same path."""
    chrome = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    seed_url = 'http://localhost:%s/site8/' % httpd.server_port
    page = brozzler.Page(None, {'url': seed_url})
    with brozzler.Browser(chrome_exe=chrome) as browser:
        discovered = worker.brozzle_page(browser, site, page)
    assert discovered == {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port,
    }
Exemplo n.º 10
0
def test_try_login(httpd):
    """Test try_login behavior.

    Three scenarios: credentials supplied and a login form present
    (form is submitted, then the page is revisited); no credentials
    (page loads normally); credentials supplied but no login form
    (nothing is submitted).
    """
    visited = []

    def on_response(msg):
        visited.append(msg['params']['response']['url'])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port
    favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port
    login_url = 'http://localhost:%s/login-action' % httpd.server_port
    username = '******'
    password = '******'

    # When username and password are defined and initial page has login form,
    # detect login form, submit login, and then return to the initial page.
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
                form_url, username=username, password=password,
                on_response=on_response)
    assert visited == [form_url, favicon_url, login_url, form_url]

    # When username and password are not defined, just load the initial page.
    del visited[:]
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(form_url, on_response=on_response)
    assert visited == [form_url, favicon_url]

    # when the page doesn't have a form with username/password, don't submit it
    del visited[:]
    form_without_login_url = (
            'http://localhost:%s/site11/form-no-login.html'
            % httpd.server_port)
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
                form_without_login_url, username=username, password=password,
                on_response=on_response)
    assert visited == [form_without_login_url, favicon_url]
Exemplo n.º 11
0
def test_page_videos(httpd):
    """Brozzling site6 records four videos: three captured via youtube-dl
    and one observed directly by the browser."""
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    seed = 'http://localhost:%s/site6/' % httpd.server_port
    page = brozzler.Page(None, {'url': seed})
    with brozzler.Browser(chrome_exe=chrome) as browser:
        worker.brozzle_page(browser, site, page)
    expected = [
        {
            'blame': 'youtube-dl',
            'response_code': 200,
            'content-length': 383631,
            'content-type': 'video/mp4',
            'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
        },
        {
            'blame': 'youtube-dl',
            'content-length': 92728,
            'content-type': 'video/webm',
            'response_code': 200,
            'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port
        },
        {
            'blame': 'youtube-dl',
            'content-length': 101114,
            'content-type': 'video/webm',
            'response_code': 200,
            'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port
        },
        {
            'blame': 'browser',
            # 'response_code': 206,
            # 'content-range': 'bytes 0-229454/229455',
            'response_code': 200,
            'content-length': 229455,
            'content-type': 'video/webm',
            'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
        },
    ]
    assert page.videos
    assert page.videos == expected
Exemplo n.º 12
0
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening the port (nobody listens on on port 4 :))
    - port bound but not accepting connections

    Fixes: the bound socket was never closed; the `with` block now
    guarantees it is released when the test finishes.
    '''
    # bind but never listen()/accept(), so connections to this port are
    # refused while the socket stays open for the duration of the loop
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in ('127.0.0.1:4',
                                    '127.0.0.1:%s' % sock.getsockname()[1]):
            site = brozzler.Site(None, {'seed': 'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            worker = brozzler.BrozzlerWorker(frontier=None,
                                             proxy=not_listening_proxy)
            chrome_exe = brozzler.suggest_default_chrome_exe()

            with brozzler.Browser(chrome_exe=chrome_exe) as browser:
                with pytest.raises(brozzler.ProxyError):
                    worker.brozzle_page(browser, site, page)
Exemplo n.º 13
0
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening the port (nobody listens on on port 4 :))
    - port bound but not accepting connections

    Fixes: the socket used to reserve a non-accepting port was leaked;
    it is now closed via the socket's context manager.
    '''
    with socket.socket() as sock:
        # bound but not listening: connects to this port will be refused
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in (
                '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
            site = brozzler.Site(None, {'seed':'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            worker = brozzler.BrozzlerWorker(
                    frontier=None, proxy=not_listening_proxy)
            chrome_exe = brozzler.suggest_default_chrome_exe()

            with brozzler.Browser(chrome_exe=chrome_exe) as browser:
                with pytest.raises(brozzler.ProxyError):
                    worker.brozzle_page(browser, site, page)
Exemplo n.º 14
0
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    """Shared helper: crawl a two-page site and verify proxy selection and,
    when warcprox is in play, the captures recorded in rethinkdb.

    Args:
        httpd: test http server fixture
        proxy: explicit proxy address to configure on the worker, or None
        warcprox_auto: let the worker discover warcprox via the service
            registry
        is_warcprox: whether the configured/discovered proxy is warcprox,
            i.e. whether captures are expected in the captures table

    Fixes: the expected-payload file was opened without ever being closed;
    it is now read inside a `with` block.
    """
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh() # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        # read the expected payload, closing the file handle promptly
        with open(os.path.join(
                os.path.dirname(__file__), 'htdocs', 'site1',
                'file1.txt'), 'rb') as f:
            expected_payload = f.read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}
Exemplo n.º 15
0
def test_page_interstitial_exception(httpd):
    """A 401 auth-challenge page surfaces as PageInterstitialShown."""
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        with pytest.raises(brozzler.PageInterstitialShown):
            browser.browse_page(
                    'http://localhost:%s/401' % httpd.server_port)
Exemplo n.º 16
0
def test_page_interstitial_exception(httpd):
    """Browsing a page that returns HTTP 401 raises PageInterstitialShown."""
    exe = brozzler.suggest_default_chrome_exe()
    target = 'http://localhost:%s/401' % httpd.server_port
    with brozzler.Browser(chrome_exe=exe) as browser, \
            pytest.raises(brozzler.PageInterstitialShown):
        browser.browse_page(target)
Exemplo n.º 17
0
def test_aw_snap_hes_dead_jim():
    """Deliberately crashing the tab raises BrowsingException."""
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        with pytest.raises(brozzler.BrowsingException):
            browser.browse_page('chrome://crash')
Exemplo n.º 18
0
def test_aw_snap_hes_dead_jim():
    """Navigating to chrome://crash kills the tab; browsing must fail
    with BrowsingException rather than hang."""
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser, \
            pytest.raises(brozzler.BrowsingException):
        browser.browse_page('chrome://crash')
Exemplo n.º 19
0
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    """Shared helper that crawls a two-page test site and checks which proxy
    the worker selected, plus (for warcprox) the recorded captures.

    Args:
        httpd: test http server fixture
        proxy: explicit proxy address for the worker, or None
        warcprox_auto: discover warcprox through the service registry
        is_warcprox: whether captures are expected in the captures table

    Fixes: `open(...).read()` leaked the file handle for the expected
    payload; it is now read within a `with` block.
    """
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh() # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        # read the expected payload without leaking the file handle
        with open(os.path.join(
                os.path.dirname(__file__), 'htdocs', 'site1',
                'file1.txt'), 'rb') as f:
            expected_payload = f.read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}