Example #1
def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1
    })
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0
    })
    page0.save()

    assert frontier.seed_page(site.id) == page0
Example #2
def test_scoping():
    test_scope = yaml.safe_load(r'''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
''')

    site = brozzler.Site(
        None, {
            'id': 1,
            'seed': 'http://example.com/foo/bar?baz=quux#monkey',
            'scope': test_scope
        })
    page = brozzler.Page(None, {
        'url': 'http://example.com/foo/bar?baz=quux#monkey',
        'site_id': site.id
    })

    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)

    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)

    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)

    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)

    assert site.is_in_scope('https://www.facebook.com/whatevz', page)

    assert not site.is_in_scope('https://www.youtube.com/watch?v=dUIn5OAPS5s',
                                page)
    yt_user_page = brozzler.Page(
        None, {
            'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
            'site_id': site.id,
            'hops_from_seed': 10
        })
    assert site.is_in_scope('https://www.youtube.com/watch?v=dUIn5OAPS5s',
                            yt_user_page)
Example #3
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        url = urlcanon.parse_url(site.seed)
        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
        urlcanon.canon.remove_fragment(url)
        page = brozzler.Page(
            frontier.rr, {
                "url": str(url),
                "site_id": site.get("id"),
                "job_id": site.get("job_id"),
                "hops_from_seed": 0,
                "priority": 1000,
                "needs_robots_check": True
            })
        if hashtag:
            page.hashtags = [
                hashtag,
            ]
        page.save()
        logging.info("queued page %s", page)
    finally:
        # finally block because we want to insert the Site no matter what
        site.save()
Example #4
 def _build_fresh_pages(self, site, parent_page, urls):
     '''
     Returns a dict of page_id => brozzler.Page.
     '''
     pages = {}
     for url in urls:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         hashtag = (url_for_crawling.hash_sign +
                    url_for_crawling.fragment).decode('utf-8')
         urlcanon.canon.remove_fragment(url_for_crawling)
         if not url_for_scoping.surt().startswith(
                 site.scope['surt'].encode('utf-8')):
             hops_off_surt = parent_page.hops_off_surt + 1
         else:
             hops_off_surt = 0
         page = brozzler.Page(
             self.rr, {
                 'url': str(url_for_crawling),
                 'site_id': site.id,
                 'job_id': site.job_id,
                 'hops_from_seed': parent_page.hops_from_seed + 1,
                 'via_page_id': parent_page.id,
                 'hops_off_surt': hops_off_surt,
                 'hashtags': []
             })
         if page.id in pages:
             pages[page.id].priority += page.priority
             page = pages[page.id]
         else:
             pages[page.id] = page
         if hashtag:
             page.hashtags = list(set(page.hashtags + [hashtag]))
     return pages
Example #5
def test_needs_browsing():
    # only one test case here right now, which exposed a bug

    class ConvenientHeaders(http.client.HTTPMessage):
        def __init__(self, headers):
            http.client.HTTPMessage.__init__(self)
            for (k, v) in headers.items():
                self.add_header(k, v)

    page = brozzler.Page(None, {
        'url': 'http://example.com/a'})

    spy = brozzler.ydl.YoutubeDLSpy()
    spy.fetches.append({
        'url': 'http://example.com/a',
        'method': 'HEAD',
        'response_code': 301,
        'response_headers': ConvenientHeaders({'Location': '/b'})})
    spy.fetches.append({
        'url': 'http://example.com/b',
        'method': 'GET',
        'response_code': 200,
        'response_headers': ConvenientHeaders({
            'Content-Type': 'application/pdf'})})

    assert not brozzler.worker.BrozzlerWorker._needs_browsing(
            None, page, spy.fetches)
Example #6
 def _build_fresh_page(self, site, parent_page, url, hops_off=0):
     url_for_scoping = urlcanon.semantic(url)
     url_for_crawling = urlcanon.whatwg(url)
     hashtag = (url_for_crawling.hash_sign +
                url_for_crawling.fragment).decode('utf-8')
     urlcanon.canon.remove_fragment(url_for_crawling)
     page = brozzler.Page(
         self.rr, {
             'url': str(url_for_crawling),
             'site_id': site.id,
             'job_id': site.job_id,
             'hops_from_seed': parent_page.hops_from_seed + 1,
             'hop_path': str(parent_page.hop_path
                             if parent_page.hop_path else "") + "L",
             'via_page_id': parent_page.id,
             'via_page_url': parent_page.url,
             'hops_off_surt': hops_off,
             'hashtags': [hashtag] if hashtag else []
         })
     return page
Example #7
def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
Example #8
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {'accepted': set(), 'blocked': set(), 'rejected': set()}
        counts = {'added': 0, 'updated': 0, 'rejected': 0, 'blocked': 0}

        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
            site, parent_page, outlinks)
        decisions['blocked'] = blocked
        decisions['rejected'] = out_of_scope
        counts['blocked'] += len(blocked)
        counts['rejected'] += len(out_of_scope)

        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)

        # get existing pages from rethinkdb
        results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
        pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}

        # build list of pages to save, consisting of new pages, and existing
        # pages updated with higher priority and new hashtags
        for fresh_page in fresh_pages.values():
            decisions['accepted'].add(fresh_page.url)
            if fresh_page.id in pages:
                page = pages[fresh_page.id]
                page.hashtags = list(
                    set((page.hashtags or []) + fresh_page.hashtags))
                page.priority += fresh_page.priority
                counts['updated'] += 1
            else:
                pages[fresh_page.id] = fresh_page
                counts['added'] += 1

        # insert/replace in batches of 50 to try to avoid this error:
        # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
        # there can be many pages and each one can be very large (many videos,
        # in and out of scope links, etc)
        pages_list = list(pages.values())
        for batch in (pages_list[i:i + 50]
                      for i in range(0, len(pages_list), 50)):
            try:
                self.logger.debug('inserting/replacing batch of %s pages',
                                  len(batch))
                result = self.rr.table('pages').insert(
                    batch, conflict='replace').run()
            except Exception:
                self.logger.error(
                    'problem inserting/replacing batch of %s pages',
                    len(batch),
                    exc_info=True)

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
            '%s new links added, %s existing links updated, %s links '
            'rejected, %s links blocked by robots from %s', counts['added'],
            counts['updated'], counts['rejected'], counts['blocked'],
            parent_page)
Example #9
 def site_pages(self, site_id, unbrozzled_only=False):
     results = self.r.table("pages").between(
             [site_id, 0 if unbrozzled_only else self.r.minval,
                 self.r.minval, self.r.minval],
             [site_id, 0 if unbrozzled_only else self.r.maxval,
                 self.r.maxval, self.r.maxval],
             index="priority_by_site").run()
     for result in results:
         yield brozzler.Page(**result)
Example #10
def new_seed_page(frontier, site):
    url = urlcanon.parse_url(site.seed)
    hashtag = (url.hash_sign + url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(url)
    page = brozzler.Page(frontier.rr, {
        "url": str(url), "site_id": site.get("id"),
        "job_id": site.get("job_id"), "hops_from_seed": 0,
        "priority": 1000, "needs_robots_check": True})
    if hashtag:
        page.hashtags = [hashtag,]
    return page
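A note on the urlcanon calls above: hash_sign and fragment are bytes fields on the parsed url, which is why they are concatenated and decoded before use. A minimal sketch, assuming urlcanon behaves as in the snippet (the url value is illustrative):

import urlcanon

url = urlcanon.parse_url('http://example.com/page#section2')
hashtag = (url.hash_sign + url.fragment).decode('utf-8')  # '#section2'
urlcanon.canon.remove_fragment(url)
assert str(url) == 'http://example.com/page'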
Example #11
 def seed_page(self, site_id):
     results = self.rr.table("pages").between(
             [site_id, r.minval, r.minval, r.minval],
             [site_id, r.maxval, r.maxval, r.maxval],
             index="priority_by_site").filter({"hops_from_seed":0}).run()
     pages = list(results)
     if len(pages) > 1:
         self.logger.warning(
                 "more than one seed page for site_id %s ?", site_id)
     if len(pages) < 1:
         return None
     return brozzler.Page(self.rr, pages[0])
Example #12
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None,
            help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Example #13
def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {'url': 'http://localhost:%s/site8/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
    }
Example #14
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        if site.remember_outlinks:
            parent_page.outlinks = {
                "accepted": [],
                "blocked": [],
                "rejected": []
            }
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            u = brozzler.site.Url(url)
            if site.is_in_scope(u, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                    if not u.surt.startswith(site.scope["surt"]):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        url,
                        site_id=site.id,
                        job_id=site.job_id,
                        hops_from_seed=parent_page.hops_from_seed + 1,
                        via_page_id=parent_page.id,
                        hops_off_surt=hops_off_surt)
                    existing_child_page = self.page(new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        self.update_page(existing_child_page)
                        counts["updated"] += 1
                    else:
                        self.new_page(new_child_page)
                        counts["added"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["accepted"].append(url)
                else:
                    counts["blocked"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["blocked"].append(url)
            else:
                counts["rejected"] += 1
                if site.remember_outlinks:
                    parent_page.outlinks["rejected"].append(url)

        if site.remember_outlinks:
            self.update_page(parent_page)

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example #15
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in ('127.0.0.1:4',
                                '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(frontier=None,
                                         proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()),
            'seed': 'http://example.com/'
        })
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(site,
                                            'http://example.com/',
                                            proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                warcprox_address=not_listening_proxy,
                url='test://proxy_down/warcprox_write_record',
                warc_type='metadata',
                content_type='text/plain',
                payload=b'''payload doesn't matter here''')
Example #16
 def site_pages(self, site_id, brozzled=None):
     '''
     Args:
         site_id (str or int):
         brozzled (bool): if true, results include only pages that have
             been brozzled at least once; if false, only pages that have
             not been brozzled; and if None (the default), all pages
     Returns:
         iterator of brozzler.Page
     '''
     results = self.rr.table("pages").between(
         [site_id, 1 if brozzled is True else 0, r.minval, r.minval], [
             site_id, 0 if brozzled is False else r.maxval, r.maxval,
             r.maxval
         ],
         index="priority_by_site").run()
     for result in results:
         yield brozzler.Page(self.rr, result)
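A hedged usage sketch of site_pages: because the "priority_by_site" index is keyed first on site id and then on brozzle count, brozzled=False narrows the scan to never-brozzled pages only. frontier here is an assumed instance exposing the method above:

# illustrative only; assumes `frontier` wraps a connected RethinkDB
# instance and `site_id` identifies an existing site
for page in frontier.site_pages(site_id, brozzled=False):
    print(page.url)  # pages that have never been brozzled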
Example #17
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {'accepted': set(), 'blocked': set(), 'rejected': set()}
        counts = {'added': 0, 'updated': 0, 'rejected': 0, 'blocked': 0}

        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
                site, parent_page, outlinks)
        decisions['blocked'] = blocked
        decisions['rejected'] = out_of_scope
        counts['blocked'] += len(blocked)
        counts['rejected'] += len(out_of_scope)

        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)

        # get existing pages from rethinkdb
        results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
        pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}

        # build list of pages to save, consisting of new pages, and existing
        # pages updated with higher priority and new hashtags
        for fresh_page in fresh_pages.values():
            decisions['accepted'].add(fresh_page.url)
            if fresh_page.id in pages:
                page = pages[fresh_page.id]
                page.hashtags = list(set((page.hashtags or [])
                                         + fresh_page.hashtags))
                page.priority += fresh_page.priority
                counts['updated'] += 1
            else:
                pages[fresh_page.id] = fresh_page
                counts['added'] += 1

        result = self.rr.table('pages').insert(
                list(pages.values()), conflict='replace').run()

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
                '%s new links added, %s existing links updated, %s links '
                'rejected, %s links blocked by robots from %s',
                counts['added'], counts['updated'], counts['rejected'],
                counts['blocked'], parent_page)
Example #18
 def claim_page(self, site, worker_id):
     # ignores the "claimed" field of the page, because only one
     # brozzler-worker can be working on a site at a time, and that would
     # have to be the worker calling this method, so if something is claimed
     # already, it must have been left that way because of some error
     result = self.rr.table("pages").between(
         [site.id, 0, r.minval, r.minval], [site.id, 0, r.maxval, r.maxval],
         index="priority_by_site").order_by(
             index=r.desc("priority_by_site")).limit(1).update(
                 {
                     "claimed": True,
                     "last_claimed_by": worker_id
                 },
                 return_changes="always").run()
     self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
     if result["unchanged"] == 0 and result["replaced"] == 0:
         raise brozzler.NothingToClaim
     else:
         return brozzler.Page(self.rr, result["changes"][0]["new_val"])
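A sketch of how a worker loop might drive claim_page, assuming the frontier above; brozzler.NothingToClaim, raised when no eligible page remains, is the natural loop terminator (brozzle() is a hypothetical processing step):

# illustrative only; `frontier`, `site`, `worker_id` and `brozzle`
# are assumed to exist in the caller's context
while True:
    try:
        page = frontier.claim_page(site, worker_id)
    except brozzler.NothingToClaim:
        break  # no unclaimed, unbrozzled page left for this site
    brozzle(page)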
Example #19
def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1,
        'url': 'http://example.com/whatever'
    })
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
        'https://example.com/', 'https://example.com/foo'
    ]
    assert sorted(parent_page.outlinks['accepted']) == [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar'
    ]
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        page_id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, page_id) is None
    for url in parent_page.outlinks['accepted']:
        page_id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, page_id)
Example #20
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    try:
        # insert the Page into the database before the Site, to avoid situation
        # where a brozzler worker immediately claims the site, finds no pages
        # to crawl, and decides the site is finished
        try:
            if brozzler.is_permitted_by_robots(site, site.seed):
                page = brozzler.Page(site.seed,
                                     site_id=site.id,
                                     job_id=site.job_id,
                                     hops_from_seed=0,
                                     priority=1000)
                frontier.new_page(page)
                logging.info("queued page %s", page)
            else:
                logging.warn("seed url %s is blocked by robots.txt", site.seed)
        finally:
            # finally block because we want to insert the Site no matter what
            frontier.new_site(site)
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
Example #21
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in ('127.0.0.1:4',
                                '127.0.0.1:%s' % sock.getsockname()[1]):
        site = brozzler.Site(None, {'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        worker = brozzler.BrozzlerWorker(frontier=None,
                                         proxy=not_listening_proxy)
        chrome_exe = brozzler.suggest_default_chrome_exe()

        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)
Example #22
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign +
                       url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site,
                                                   str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        self.rr, {
                            'url': str(url_for_crawling),
                            'site_id': site.id,
                            'job_id': site.job_id,
                            'hops_from_seed': parent_page.hops_from_seed + 1,
                            'via_page_id': parent_page.id,
                            'hops_off_surt': hops_off_surt
                        })
                    existing_child_page = brozzler.Page.load(
                        self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [
                                hashtag,
                            ]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
                else:
                    counts["blocked"] += 1
                    decisions["blocked"].add(str(url_for_crawling))
            else:
                counts["rejected"] += 1
                decisions["rejected"].add(str(url_for_crawling))

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example #23
def test_completed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # redirect that changes scope surt
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url': 'http://example.com/b/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because destination is covered by
    # the original surt
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url': 'http://example.com/a/x/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because page is not the seed page
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/c/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 1,
        'redirect_url': 'http://example.com/d/'})
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False
Example #24
def test_parent_url_scoping():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
            },
        'remember_outlinks': True})
    site.save()

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []
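The stub-and-restore pattern repeated throughout this test could be factored into a context manager; a minimal sketch, with permissive_robots as a made-up helper name:

import contextlib

@contextlib.contextmanager
def permissive_robots():
    # temporarily bypass robots.txt checking, restoring on exit
    orig = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        yield
    finally:
        brozzler.is_permitted_by_robots = orig

# usage:
#     with permissive_robots():
#         frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)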
Example #25
def test_field_defaults():
    rr = doublethink.Rethinker('localhost', db='ignoreme')

    # page
    brozzler.Page.table_ensure(rr)
    page = brozzler.Page(rr, {'hops_from_seed': 3})
    assert page.hops_from_seed == 3
    assert page.id
    assert page.brozzle_count == 0
    page.save()
    assert page.hops_from_seed == 3
    assert page.id
    assert page.brozzle_count == 0

    qage = brozzler.Page.load(rr, page.id)
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0
    qage.save()
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0
    qage.refresh()
    assert qage.hops_from_seed == 3
    assert qage.id == page.id
    assert qage.brozzle_count == 0

    # site
    brozzler.Site.table_ensure(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    assert site.id is None
    assert site.scope
    assert site.scope['surt'] == 'http://(com,example,)/'
    site.save()
    assert site.id
    assert site.scope

    tite = brozzler.Site.load(rr, site.id)
    assert tite.id == site.id
    assert tite.scope == site.scope
    tite.save()
    assert tite.id == site.id
    assert tite.scope == site.scope
    tite.refresh()
    assert tite.id == site.id
    assert tite.scope == site.scope

    # job
    brozzler.Job.table_ensure(rr)
    job = brozzler.Job(rr, {'status': 'WHUUUT'})
    assert job.status == 'WHUUUT'
    assert job.id is None
    assert job.starts_and_stops
    job.save()
    assert job.status == 'WHUUUT'
    assert job.id
    assert job.starts_and_stops

    kob = brozzler.Job.load(rr, job.id)
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops
    kob.save()
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops
    kob.refresh()
    assert kob.status == 'WHUUUT'
    assert kob.id
    assert kob.starts_and_stops
Example #26
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument('-e',
                            '--chrome-exe',
                            dest='chrome_exe',
                            default=suggest_default_chrome_exe(),
                            help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '--behavior-parameters',
        dest='behavior_parameters',
        default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username',
        dest='username',
        default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password',
        dest='password',
        default=None,
        help='use this password to try to log in if a login form is found')
    arg_parser.add_argument('--proxy',
                            dest='proxy',
                            default=None,
                            help='http proxy')
    arg_parser.add_argument('--skip-extract-outlinks',
                            dest='skip_extract_outlinks',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-visit-hashtags',
                            dest='skip_visit_hashtags',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-youtube-dl',
                            dest='skip_youtube_dl',
                            action='store_true',
                            help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(
        None, {
            'id': -1,
            'seed': args.url,
            'behavior_parameters': behavior_parameters,
            'username': args.username,
            'password': args.password
        })
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None,
        proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
            datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
            browser,
            site,
            page,
            on_screenshot=on_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Example #27
 def page(self, id):
     result = self.r.table("pages").get(id).run()
     if result:
         return brozzler.Page(**result)
     else:
         return None