Example #1
    def test_periodic_heartbeat(self):
        controller = self.make_fresh_controller()
        controller.sync_loop_timing = 1
        controller.healthy_service_ids = {
            'trough-read:test01:id0', 'trough-read:test01:id1'
        }
        assert set(self.rethinker.table('services')['id'].run()) == set()

        # first time it inserts individual services
        heartbeats_after = doublethink.utcnow()
        healthy_service_ids = controller.periodic_heartbeat()
        assert set(healthy_service_ids) == {
            'trough-read:test01:id0', 'trough-read:test01:id1'
        }
        assert set(self.rethinker.table('services')['id'].run()) == {
            'trough-nodes:test01:None', 'trough-read:test01:id0',
            'trough-read:test01:id1'
        }
        for svc in self.rethinker.table('services').run():
            assert svc['last_heartbeat'] > heartbeats_after

        # subsequently updates existing services in one bulk query
        heartbeats_after = doublethink.utcnow()
        healthy_service_ids = controller.periodic_heartbeat()
        assert set(healthy_service_ids) == {
            'trough-read:test01:id0', 'trough-read:test01:id1'
        }
        assert set(self.rethinker.table('services')['id'].run()) == {
            'trough-nodes:test01:None', 'trough-read:test01:id0',
            'trough-read:test01:id1'
        }
        for svc in self.rethinker.table('services').run():
            assert svc['last_heartbeat'] > heartbeats_after
Example #2
 def finished(self, site, status):
     self.logger.info("%s %s", status, site)
     site.status = status
     site.claimed = False
     site.last_disclaimed = doublethink.utcnow()
     site.starts_and_stops[-1]["stop"] = doublethink.utcnow()
     site.save()
     if site.job_id:
         self._maybe_finish_job(site.job_id)
Example #3
 def resume_job(self, job):
     job.status = "ACTIVE"
     job.starts_and_stops.append(
             {"start":doublethink.utcnow(), "stop":None})
     job.save()
     for site in self.job_sites(job.id):
         site.status = "ACTIVE"
         site.starts_and_stops.append(
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()
Example #4
 def resume_job(self, job):
     job.status = "ACTIVE"
     job.stop_requested = None
     job.starts_and_stops.append(
             {"start":doublethink.utcnow(), "stop":None})
     job.save()
     for site in self.job_sites(job.id):
         site.status = "ACTIVE"
         site.starts_and_stops.append(
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()
Example #5
 def resume_site(self, site):
     if site.job_id:
         # can't call resume_job since that would resume the job's other sites
         job = brozzler.Job.load(self.rr, site.job_id)
         job.status = "ACTIVE"
         job.starts_and_stops.append(
                 {"start":doublethink.utcnow(), "stop":None})
         job.save()
     site.status = "ACTIVE"
     site.starts_and_stops.append(
             {"start":doublethink.utcnow(), "stop":None})
     site.save()
Example #6
    def _proxy_request(self):
        warcprox_meta = None
        raw_warcprox_meta = self.headers.get('Warcprox-Meta')
        self.logger.trace('request for %s Warcprox-Meta header: %s', self.url,
                          raw_warcprox_meta)
        if raw_warcprox_meta:
            warcprox_meta = json.loads(raw_warcprox_meta)
            del self.headers['Warcprox-Meta']

        remote_ip = self._remote_server_conn.sock.getpeername()[0]
        timestamp = doublethink.utcnow()
        extra_response_headers = {}
        if warcprox_meta and 'accept' in warcprox_meta and \
                'capture-metadata' in warcprox_meta['accept']:
            rmeta = {
                'capture-metadata': {
                    'timestamp': timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
                }
            }
            extra_response_headers['Warcprox-Meta'] = json.dumps(
                rmeta, separators=',:')

        req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
            self, extra_response_headers=extra_response_headers)

        content_type = None
        try:
            content_type = prox_rec_res.headers.get('content-type')
        except AttributeError:  # py2
            raw = prox_rec_res.msg.getrawheader('content-type')
            if raw:
                content_type = raw.strip()

        recorded_url = RecordedUrl(url=self.url,
                                   request_data=req,
                                   response_recorder=prox_rec_res.recorder,
                                   remote_ip=remote_ip,
                                   warcprox_meta=warcprox_meta,
                                   status=prox_rec_res.status,
                                   size=prox_rec_res.recorder.len,
                                   client_ip=self.client_address[0],
                                   content_type=content_type,
                                   method=self.command,
                                   timestamp=timestamp,
                                   host=self.hostname,
                                   duration=doublethink.utcnow() - timestamp,
                                   referer=self.headers.get('referer'),
                                   payload_digest=prox_rec_res.payload_digest,
                                   truncated=prox_rec_res.truncated)
        self.server.recorded_url_q.put(recorded_url)

        return recorded_url
Example #7
 def resume_site(self, site):
     if site.job_id:
         # can't call resume_job since that would resume the job's other sites
         job = brozzler.Job.load(self.rr, site.job_id)
         job.status = "ACTIVE"
         site.stop_requested = None
         job.starts_and_stops.append(
                 {"start":doublethink.utcnow(), "stop":None})
         job.save()
     site.status = "ACTIVE"
     site.starts_and_stops.append(
             {"start":doublethink.utcnow(), "stop":None})
     site.save()
Example #8
 def ydl_progress(*args, **kwargs):
     # in case youtube-dl takes a long time, heartbeat site.last_claimed
     # to prevent another brozzler-worker from claiming the site
     try:
         if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=7):
             self.logger.debug(
                     'heartbeating site.last_claimed to prevent another '
                     'brozzler-worker claiming this site id=%r', site.id)
             site.last_claimed = doublethink.utcnow()
             site.save()
     except:
         self.logger.debug(
                 'problem heartbeating site.last_claimed site id=%r',
                 site.id, exc_info=True)
Example #9
 def maybe_heartbeat_site_last_claimed(*args, **kwargs):
     # in case yt-dlp takes a long time, heartbeat site.last_claimed
     # to prevent another brozzler-worker from claiming the site
     try:
         if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
             worker.logger.debug(
                     'heartbeating site.last_claimed to prevent another '
                     'brozzler-worker claiming this site id=%r', site.id)
             site.last_claimed = doublethink.utcnow()
             site.save()
     except:
         worker.logger.debug(
                 'problem heartbeating site.last_claimed site id=%r',
                 site.id, exc_info=True)
Example #10
    def honor_stop_request(self, site):
        """Raises brozzler.CrawlStopped if stop has been requested."""
        site.refresh()
        if (site.stop_requested
                and site.stop_requested <= doublethink.utcnow()):
            self.logger.info("stop requested for site %s", site.id)
            raise brozzler.CrawlStopped

        if site.job_id:
            job = brozzler.Job.load(self.rr, site.job_id)
            if (job and job.stop_requested
                    and job.stop_requested <= doublethink.utcnow()):
                self.logger.info("stop requested for job %s", site.job_id)
                raise brozzler.CrawlStopped
Example #11
    def send_error(self, code, message=None, explain=None, exception=None):
        super().send_error(code,
                           message=message,
                           explain=explain,
                           exception=exception)

        # If an error happens during CONNECT handling and before the inner
        # request, self.url is unset, and self.path is something like
        # 'example.com:443'
        urlish = self.url or self.path

        warcprox_meta = self._parse_warcprox_meta()
        self._swallow_hop_by_hop_headers()
        request_data = self._build_request()

        failed_url = FailedUrl(url=urlish,
                               request_data=request_data,
                               warcprox_meta=warcprox_meta,
                               status=code,
                               client_ip=self.client_address[0],
                               method=self.command,
                               timestamp=doublethink.utcnow(),
                               host=self.hostname,
                               duration=None,
                               referer=self.headers.get('referer'),
                               do_not_archive=True,
                               message=message,
                               exception=exception)

        self.server.recorded_url_q.put(failed_url)
Example #12
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
                "conf": job_conf, "status": "ACTIVE",
                "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    pages = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        site.id = str(uuid.uuid4())
        sites.append(site)
        pages.append(new_seed_page(frontier, site))

    # insert in batches to avoid this error
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
        logging.info('inserting batch of %s pages', len(batch))
        result = frontier.rr.table('pages').insert(batch).run()
    for batch in (sites[i:i+100]  for i in range(0, len(sites), 100)):
        logging.info('inserting batch of %s sites', len(batch))
        result = frontier.rr.table('sites').insert(batch).run()
    logging.info('job %s fully started', job.id)

    return job
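
A note on the batching above: RethinkDB rejects queries larger than roughly 128 MiB (the 134217727-byte maximum in the quoted error), so new_job() splits its inserts into fixed-size chunks. The pattern generalizes; here is a minimal sketch, where chunked is a hypothetical helper and not part of brozzler:

    def chunked(seq, size):
        '''Yields successive slices of seq, each at most size items long.'''
        for i in range(0, len(seq), size):
            yield seq[i:i + size]

    # usage, with frontier and pages as in new_job() above:
    # for batch in chunked(pages, 500):
    #     frontier.rr.table('pages').insert(batch).run()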
Example #13
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
                "conf": job_conf, "status": "ACTIVE",
                "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        sites.append(site)

    for site in sites:
        new_site(frontier, site)

    return job
Example #14
    def populate_defaults(self):
        if not "status" in self:
            self.status = "ACTIVE"
        if not "claimed" in self:
            self.claimed = False
        if not "last_disclaimed" in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
        if not "last_claimed" in self:
            self.last_claimed = brozzler.EPOCH_UTC
        if not "scope" in self:
            self.scope = {}
        if not "surt" in self.scope and self.seed:
            self.scope["surt"] = brozzler.site_surt_canon(
                    self.seed).surt().decode('ascii')

        if not "starts_and_stops" in self:
            if self.get("start_time"):   # backward compatibility
                self.starts_and_stops = [{
                    "start":self.get("start_time"),"stop":None}]
                if self.get("status") != "ACTIVE":
                    self.starts_and_stops[0]["stop"] = self.last_disclaimed
                del self["start_time"]
            else:
                self.starts_and_stops = [
                        {"start":doublethink.utcnow(),"stop":None}]
Example #15
    def postfetch_status(self):
        earliest = self.earliest_still_active_fetch_start()
        if earliest:
            seconds_behind = (doublethink.utcnow() - earliest).total_seconds()
        else:
            seconds_behind = 0
        result = {
            'earliest_still_active_fetch_start': earliest,
            'seconds_behind': seconds_behind,
            'postfetch_chain': []
        }
        for processor in self._postfetch_chain:
            if processor.__class__ == warcprox.ListenerPostfetchProcessor:
                name = processor.listener.__class__.__name__
            else:
                name = processor.__class__.__name__

            queued = len(processor.inq.queue)
            if hasattr(processor, 'batch'):
                queued += len(processor.batch)

            result['postfetch_chain'].append({
                'processor': name,
                'queued_urls': queued
            })
        return result
Example #16
 def finish(self):
     if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
         self.logger.error(
                 "job is already finished status=%s "
                 "starts_and_stops[-1]['stop']=%s", self.status,
                 self.starts_and_stops[-1]["stop"])
     self.status = "FINISHED"
     self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
Example #17
def brozzler_stop_crawl(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    add_rethinkdb_options(arg_parser)
    group.add_argument('--job',
                       dest='job_id',
                       metavar='JOB_ID',
                       help=('request crawl stop for the specified job'))
    group.add_argument('--site',
                       dest='site_id',
                       metavar='SITE_ID',
                       help=('request crawl stop for the specified site'))
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    if args.job_id:
        try:
            job_id = int(args.job_id)
        except ValueError:
            job_id = args.job_id
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('job not found with id=%r', job_id)
            sys.exit(1)
        job.stop_requested = doublethink.utcnow()
        job.save()
    elif args.site_id:
        try:
            site_id = int(args.site_id)
        except ValueError:
            site_id = args.site_id
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('site not found with id=%r', site_id)
            sys.exit(1)
        site.stop_requested = doublethink.utcnow()
        site.save()
Example #18
 def process_request(self, request, client_address):
     self.active_requests[request] = doublethink.utcnow()
     future = self.pool.submit(self.process_request_thread, request,
                               client_address)
     future.add_done_callback(
         lambda f: self.active_requests.pop(request, None))
     if future.done():
         # avoid theoretical timing issue, in case process_request_thread
         # managed to finish before future.add_done_callback() ran
         self.active_requests.pop(request, None)
Example #19
def test_utcnow():
    now_notz = datetime.datetime.utcnow()  # has no timezone :(
    assert not now_notz.tzinfo

    now_tz = doublethink.utcnow()  # solution to that problem
    assert now_tz.tzinfo

    ## .timestamp() was added in python 3.3
    if hasattr(now_tz, 'timestamp'):
        assert now_tz.timestamp() - now_notz.timestamp() < 0.1
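
The test above pins down the property everything else in this collection relies on: doublethink.utcnow() returns a timezone-aware datetime, unlike datetime.datetime.utcnow(), so subtraction against other aware timestamps is well-defined. A standard-library-only sketch of an equivalent (an assumption about the implementation, not doublethink's actual code):

    import datetime

    def utcnow_aware():
        # the same instant utcnow() would give, but carrying tzinfo=UTC
        return datetime.datetime.now(datetime.timezone.utc)

    assert utcnow_aware().tzinfo is not None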
Example #20
 def disclaim_site(self, site, page=None):
     self.logger.info("disclaiming %s", site)
     site.claimed = False
     site.last_disclaimed = doublethink.utcnow()
     if not page and not self.has_outstanding_pages(site):
         self.finished(site, "FINISHED")
     else:
         site.save()
     if page:
         page.claimed = False
         page.save()
Example #21
 def elapsed(self):
     '''Returns elapsed crawl time as a float in seconds.'''
     dt = 0
     for ss in self.starts_and_stops[:-1]:
         dt += (ss['stop'] - ss['start']).total_seconds()
     ss = self.starts_and_stops[-1]
     if ss['stop']:
         dt += (ss['stop'] - ss['start']).total_seconds()
     else:  # crawl is active
         dt += (doublethink.utcnow() - ss['start']).total_seconds()
     return dt
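
The starts_and_stops list walked by elapsed() holds one {'start': ..., 'stop': ...} dict per activation, and only the final entry may have stop=None. A tiny worked example with hypothetical timestamps:

    import datetime
    import doublethink

    t0 = doublethink.utcnow() - datetime.timedelta(minutes=10)
    starts_and_stops = [
        {'start': t0, 'stop': t0 + datetime.timedelta(minutes=3)},    # 180s
        {'start': t0 + datetime.timedelta(minutes=5), 'stop': None},  # active
    ]
    # elapsed() would report the closed 180s interval plus roughly the
    # five minutes the last entry has been open: about 480.0 seconds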
Example #22
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= (
            doublethink.utcnow() - datetime.timedelta(minutes=1))
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
Example #23
    def _service_heartbeat_if_due(self):
        '''Sends service registry heartbeat if due'''
        due = False
        if self._service_registry:
            if not hasattr(self, "status_info"):
                due = True
            else:
                d = doublethink.utcnow() - self.status_info["last_heartbeat"]
                due = d.total_seconds() > self.HEARTBEAT_INTERVAL

        if due:
            self._service_heartbeat()
Example #24
    def __init__(
            self, stats_db=None, status_callback=None,
            options=warcprox.Options()):
        self.start_time = doublethink.utcnow()

        warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
                self, WarcProxyHandler, options)

        self.status_callback = status_callback
        self.stats_db = stats_db
        self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
        self.running_stats = warcprox.stats.RunningStats()
Example #25
 def populate_defaults(self):
     if not "status" in self:
         self.status = "ACTIVE"
     if not "starts_and_stops" in self:
         if self.get("started"):   # backward compatibility
             self.starts_and_stops = [{
                 "start": self.get("started"),
                 "stop": self.get("finished")}]
             del self["started"]
             if "finished" in self:
                 del self["finished"]
         else:
             self.starts_and_stops = [
                     {"start":doublethink.utcnow(),"stop":None}]
Example #26
    def elapsed(self):
        '''
        Returns elapsed crawl time as a float in seconds.

        This metric includes all the time that a site was in active rotation,
        including any time it spent waiting for its turn to be brozzled.

        In contrast `Site.active_brozzling_time` only counts time when a
        brozzler worker claimed the site and was actively brozzling it.
        '''
        dt = 0
        for ss in self.starts_and_stops[:-1]:
            if ss['stop']:
                dt += (ss['stop'] - ss['start']).total_seconds()
            else:
                self.logger.warning("missing expected ss['stop']")
                dt += (doublethink.utcnow() - ss['start']).total_seconds()
        ss = self.starts_and_stops[-1]
        if ss['stop']:
            dt += (ss['stop'] - ss['start']).total_seconds()
        else:  # crawl is active
            dt += (doublethink.utcnow() - ss['start']).total_seconds()
        return dt
Example #27
    def __init__(self,
                 stats_db=None,
                 status_callback=None,
                 options=warcprox.Options()):
        self.start_time = doublethink.utcnow()
        self.status_callback = status_callback
        self.stats_db = stats_db
        self.options = options
        self.remote_connection_pool = PoolManager(
            num_pools=max(round(options.max_threads / 6), 200)
            if options.max_threads else 200)
        server_address = (options.address or 'localhost',
                          options.port if options.port is not None else 8000)

        if options.onion_tor_socks_proxy:
            try:
                host, port = options.onion_tor_socks_proxy.split(':')
                WarcProxyHandler.onion_tor_socks_proxy_host = host
                WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
            except ValueError:
                WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
                WarcProxyHandler.onion_tor_socks_proxy_port = None

        if options.socket_timeout:
            WarcProxyHandler._socket_timeout = options.socket_timeout
        if options.max_resource_size:
            WarcProxyHandler._max_resource_size = options.max_resource_size
        if options.tmp_file_max_memory_size:
            WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size

        http_server.HTTPServer.__init__(self,
                                        server_address,
                                        WarcProxyHandler,
                                        bind_and_activate=True)

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
        self.ca = CertificateAuthority(
            ca_file=options.cacert or 'warcprox-ca.pem',
            certs_dir=options.certs_dir or './warcprox-ca',
            ca_name=ca_name)

        self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)

        self.running_stats = warcprox.stats.RunningStats()
Example #28
    def populate_defaults(self):
        if not "status" in self:
            self.status = "ACTIVE"
        if not "claimed" in self:
            self.claimed = False
        if not "last_disclaimed" in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
        if not "last_claimed" in self:
            self.last_claimed = brozzler.EPOCH_UTC
        if not "scope" in self:
            self.scope = {}

        # backward compatibility
        if "surt" in self.scope:
            if not "accepts" in self.scope:
                self.scope["accepts"] = []
            self.scope["accepts"].append({"surt": self.scope["surt"]})
            del self.scope["surt"]

        # backward compatibility
        if ("max_hops_off_surt" in self.scope
                and not "max_hops_off" in self.scope):
            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
        if "max_hops_off_surt" in self.scope:
            del self.scope["max_hops_off_surt"]

        if self.seed:
            self._accept_ssurt_if_not_redundant(
                brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))

        if not "starts_and_stops" in self:
            if self.get("start_time"):  # backward compatibility
                self.starts_and_stops = [{
                    "start": self.get("start_time"),
                    "stop": None
                }]
                if self.get("status") != "ACTIVE":
                    self.starts_and_stops[0]["stop"] = self.last_disclaimed
                del self["start_time"]
            else:
                self.starts_and_stops = [{
                    "start": doublethink.utcnow(),
                    "stop": None
                }]
Example #29
    def elapsed(self):
        '''
        Returns elapsed crawl time as a float in seconds.

        This metric includes all the time that a site was in active rotation,
        including any time it spent waiting for its turn to be brozzled.

        In contrast `Site.active_brozzling_time` only counts time when a
        brozzler worker claimed the site and was actively brozzling it.
        '''
        dt = 0
        for ss in self.starts_and_stops[:-1]:
            dt += (ss['stop'] - ss['start']).total_seconds()
        ss = self.starts_and_stops[-1]
        if ss['stop']:
            dt += (ss['stop'] - ss['start']).total_seconds()
        else: # crawl is active
            dt += (doublethink.utcnow() - ss['start']).total_seconds()
        return dt
Example #30
def test_honor_stop_request():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # 1. test stop request on job
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set job.stop_requested
    job.stop_requested = datetime.datetime.utcnow().replace(
            tzinfo=doublethink.UTC)
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)

    # 2. test stop request on site
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set site.stop_requested
    site.stop_requested = doublethink.utcnow()
    site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)
Example #31
 def claim_site(self, worker_id):
     # XXX keep track of aggregate priority and prioritize sites accordingly?
     while True:
         result = (
             self.rr.table("sites", read_mode="majority").between(
                 ["ACTIVE", r.minval], ["ACTIVE", r.maxval],
                 index="sites_last_disclaimed").order_by(
                     index="sites_last_disclaimed").filter(
                         (r.row["claimed"] != True)
                         | (r.row["last_claimed"] < r.now() - 2 * 60 * 60)).
             limit(1).update(
                 # try to avoid a race condition resulting in multiple
                 # brozzler-workers claiming the same site
                 # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                 r.branch(
                     (r.row["claimed"] != True) |
                     (r.row["last_claimed"] < r.now() - 2 * 60 * 60), {
                         "claimed": True,
                         "last_claimed_by": worker_id,
                         "last_claimed": doublethink.utcnow()
                     }, {}),
                 return_changes=True)).run()
         self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
         if result["replaced"] == 1:
             if result["changes"][0]["old_val"]["claimed"]:
                 self.logger.warn(
                     "re-claimed site that was still marked 'claimed' "
                     "because it was last claimed a long time ago "
                     "at %s, and presumably some error stopped it from "
                     "being disclaimed",
                     result["changes"][0]["old_val"]["last_claimed"])
             site = brozzler.Site(self.rr, result["changes"][0]["new_val"])
         else:
             raise brozzler.NothingToClaim
         # XXX This is the only place we enforce time limit for now. Worker
         # loop should probably check time limit. Maybe frontier needs a
         # housekeeping thread to ensure that time limits get enforced in a
         # timely fashion.
         if not self._enforce_time_limit(site):
             return site
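
The linked rethinkdb comment explains the shape of this query: filter-then-update is not atomic, so another worker could claim the row between the two steps; re-evaluating the claim predicate inside update() via r.branch() makes the write a no-op if the row was grabbed in the meantime. Stripped of brozzler specifics, the pattern is sketched below, assuming a doublethink.Rethinker handle rr and the usual rethinkdb query-builder r:

    predicate = ((r.row['claimed'] != True)
                 | (r.row['last_claimed'] < r.now() - 2 * 60 * 60))
    result = rr.table('sites').filter(predicate).limit(1).update(
            r.branch(predicate, {
                'claimed': True,
                'last_claimed': doublethink.utcnow()}, {}),
            return_changes=True).run()
    claimed_row = (result['changes'][0]['new_val']
                   if result['replaced'] == 1 else None)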
Example #32
    def unique_service(self, role, candidate=None):
        '''
        Retrieve a unique service, possibly setting or heartbeating it first.

        A "unique service" is a service with only one instance for a given
        role. Uniqueness is enforced by using the role name as the primary key
        `{'id':role, ...}`.

        Args:
            role (str): role name
            candidate (dict): if supplied, candidate info for the unique
                service, explained below

        `candidate` normally represents "myself, this instance of the service".
        When a service supplies `candidate`, it is nominating itself for
        selection as the unique service, or retaining its claim to the role
        (heartbeating).

        If `candidate` is supplied:

            First, atomically in a single rethinkdb query, checks if there is
            already a unique healthy instance of this service in rethinkdb, and
            if not, sets `candidate` as the unique service.

            Looks at the result of that query to determine if `candidate` is
            the unique service or not. If it is, updates 'last_heartbeat' in
            rethinkdb.

            To determine whether `candidate` is the unique service, checks that
            all the fields other than 'first_heartbeat' and 'last_heartbeat'
            have the same value in `candidate` as in the value returned from
            rethinkdb.

            ***Important***: this means that the caller must ensure that none
            of the fields of the unique service ever change. Don't store things
            like 'load' or any other volatile value in there. If you try to do
            that, heartbeats will end up not being sent, and the unique service
            will flap among the candidates.

        Finally, retrieves the service from rethinkdb and returns it, if it is
        healthy.

        Returns:
            the unique service, if there is one and it is healthy, otherwise
            None
        '''
        # use the same concept of 'now' for all queries
        now = doublethink.utcnow()
        if candidate is not None:
            candidate['id'] = role

            if not 'ttl' in candidate:
                raise Exception("candidate is missing required field 'ttl'")
            val = candidate['ttl']
            if not (isinstance(val, float)
                    or isinstance(val, int)) or val <= 0:
                raise Exception("'ttl' must be a number > 0")

            candidate['first_heartbeat'] = now
            candidate['last_heartbeat'] = now
            if not 'host' in candidate:
                candidate['host'] = socket.gethostname()
            if not 'pid' in candidate:
                candidate['pid'] = os.getpid()

            result = self.rr.table(
                'services', read_mode='majority').get(role).replace(
                    lambda row: r.branch(
                        r.branch(row, row['last_heartbeat'] > now - row['ttl'],
                                 False), row, candidate),
                    return_changes='always').run()
            new_val = result['changes'][0]['new_val']
            if all([
                    new_val.get(k) == candidate[k] for k in candidate
                    if k not in ('first_heartbeat', 'last_heartbeat')
            ]):
                # candidate is the unique_service, send a heartbeat
                del candidate['first_heartbeat']  # don't touch first_heartbeat
                self.rr.table('services').get(role).update(candidate).run()

        results = list(
            self.rr.table('services', read_mode='majority').get_all(role).
            filter(lambda row: row['last_heartbeat'] > now - row['ttl']).run())
        if results:
            return results[0]
        else:
            return None
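
The docstring above is easiest to digest next to a usage sketch. Each instance nominates itself with a stable candidate dict and checks whether it came out holding the role; the role name 'crawl-master' and the 'port' field here are illustrative assumptions, not part of the API:

    import os
    import socket
    import doublethink

    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)

    candidate = {'ttl': 60, 'port': 8080}  # stable fields only, per the warning
    master = svcreg.unique_service('crawl-master', candidate=candidate)
    if (master and master['host'] == socket.gethostname()
            and master['pid'] == os.getpid()):
        pass  # this process currently holds the role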
Example #33
    def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
        '''
        Handles a request with http method WARCPROX_WRITE_RECORD, a special
        type of request which tells warcprox to construct a warc record from
        the request more or less verbatim, and write it to a warc.

        To honor the request, this method creates a RecordedUrl and queues it
        for the WarcWriterThread to process. The warc record headers Content-Type
        and WARC-Type are taken from the request headers, as is the payload.

        Example request:

        WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
        WARC-Type: metadata
        Content-Type: image/png
        Content-Length: 12345
        Connection: close

        <png image data>
        '''
        try:
            self.url = self.path
            self._enforce_limits_and_blocks()

            if ('Content-Length' in self.headers and 'Content-Type' in self.headers
                    and (warc_type or 'WARC-Type' in self.headers)):
                timestamp = doublethink.utcnow()

                request_data = tempfile.SpooledTemporaryFile(
                        max_size=self._tmp_file_max_memory_size)
                payload_digest = hashlib.new(self.server.digest_algorithm)

                # XXX we don't support chunked uploads for now
                length = int(self.headers['Content-Length'])
                buf = self.rfile.read(min(65536, length - request_data.tell()))
                while buf != b'':
                    request_data.write(buf)
                    payload_digest.update(buf)
                    buf = self.rfile.read(
                            min(65536, length - request_data.tell()))

                warcprox_meta = None
                raw_warcprox_meta = self.headers.get('Warcprox-Meta')
                if raw_warcprox_meta:
                    warcprox_meta = json.loads(raw_warcprox_meta)

                rec_custom = RecordedUrl(
                        url=self.url,
                        request_data=request_data,
                        response_recorder=None,
                        remote_ip=b'',
                        warcprox_meta=warcprox_meta,
                        content_type=self.headers['Content-Type'],
                        custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
                        status=204,
                        size=request_data.tell(),
                        client_ip=self.client_address[0],
                        method=self.command,
                        timestamp=timestamp,
                        duration=doublethink.utcnow()-timestamp,
                        payload_digest=payload_digest)
                request_data.seek(0)

                self.server.recorded_url_q.put(rec_custom)
                self.send_response(204, 'OK')
            else:
                self.send_error(400, message='Bad request', explain=(
                    'Bad request. WARC-Type, Content-Length, and Content-Type '
                    'request headers required for WARCPROX_WRITE_RECORD '
                    'request.'))

            self.end_headers()
        except warcprox.RequestBlockedByRule as e:
            # limit enforcers have already sent the appropriate response
            self.logger.info("%r: %r", self.requestline, e)
            return
        except:
            self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
            raise
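
From the client side, the example request in the docstring can be reproduced with http.client, which passes the custom method through verbatim; the host and port are assumptions:

    import http.client

    payload = b'<png image data>'
    conn = http.client.HTTPConnection('localhost', 8000)  # warcprox address
    conn.request(
            'WARCPROX_WRITE_RECORD', 'screenshot:https://example.com/',
            body=payload, headers={
                'WARC-Type': 'metadata',
                'Content-Type': 'image/png',
                'Content-Length': str(len(payload))})
    assert conn.getresponse().status == 204  # acknowledged, per the code above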
Example #34
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000,
        'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000
    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
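
The assertions above imply the selection policy without showing it: among registered warcprox services, prefer the one with the fewest ACTIVE sites already assigned to it, breaking ties on the reported 'load'. A reconstruction inferred from the test, not brozzler's actual implementation:

    import collections

    def choose_warcprox(services, sites):
        '''services and sites are lists of dicts shaped like the rows above.'''
        counts = collections.Counter(
                site['proxy'] for site in sites
                if site.get('status') == 'ACTIVE' and site.get('proxy'))
        def burden(svc):
            return (counts['%s:%s' % (svc['host'], svc['port'])], svc['load'])
        return min(services, key=burden) if services else None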