Exemplo n.º 1
0
    def __init__(self, job, job_name, server_level, downstream_job=None):
        """Create a server task and register its pseudo-thread.

        Args:
          job: stored as self.job.
          job_name: prefix of the generated server id and key of the
            per-job counter kept in Server.num_servers.
          server_level: level of this server. Level 0 must not have a
            downstream job; every other level must have one.
          downstream_job: stored as self.downstream_job.
        """
        # A downstream job is absent exactly when this is a level 0 server.
        assert (downstream_job is None) == (server_level == 0)

        self.server_level = server_level
        self.master = None
        self.downstream_job = downstream_job
        self.job = job

        # Servers are numbered per job name, starting at 1.
        sequence = Server.num_servers.get(job_name, 0) + 1
        Server.num_servers[job_name] = sequence
        self.server_id = '%s:%d' % (job_name, sequence)

        self.state = ServerStateWrapper(self)

        # Registers the pseudo-thread that does discovery and gets
        # resource capacity.
        scheduler.add_thread(self, 0)
Exemplo n.º 2
0
  def __init__(self, job, job_name, server_level, downstream_job=None):
    """Create a server task and register its pseudo-thread.

    Args:
      job: stored as self.job.
      job_name: prefix of the generated server id and key of the
        per-job counter kept in Server.num_servers.
      server_level: level of this server. Level 0 must not have a
        downstream job; every other level must have one.
      downstream_job: stored as self.downstream_job.
    """
    # A downstream job is absent exactly when this is a level 0 server.
    assert (downstream_job is None) == (server_level == 0)

    self.server_level = server_level
    self.master = None
    self.downstream_job = downstream_job
    self.job = job

    # Servers are numbered per job name, starting at 1.
    sequence = Server.num_servers.get(job_name, 0) + 1
    Server.num_servers[job_name] = sequence
    self.server_id = '%s:%d' % (job_name, sequence)

    self.state = ServerStateWrapper(self)

    # Registers the pseudo-thread that does discovery and gets
    # resource capacity.
    scheduler.add_thread(self, 0)
Exemplo n.º 3
0
class Server(object):
    # Used to generate server identifiers.
    num_servers = dict()

    # Constructor.
    def __init__(self, job, job_name, server_level, downstream_job=None):
        """Create a server task and register its pseudo-thread.

        Args:
          job: stored as self.job.
          job_name: prefix of the generated server id and key of the
            per-job counter kept in Server.num_servers.
          server_level: level of this server. Level 0 must not have a
            downstream job; every other level must have one.
          downstream_job: stored as self.downstream_job.
        """
        # A downstream job is absent exactly when this is a level 0 server.
        assert (downstream_job is None) == (server_level == 0)

        self.server_level = server_level
        self.master = None
        self.downstream_job = downstream_job
        self.job = job

        # Servers are numbered per job name, starting at 1.
        sequence = Server.num_servers.get(job_name, 0) + 1
        Server.num_servers[job_name] = sequence
        self.server_id = '%s:%d' % (job_name, sequence)

        self.state = ServerStateWrapper(self)

        # Registers the pseudo-thread that does discovery and gets
        # resource capacity.
        scheduler.add_thread(self, 0)

    def get_server_id(self):
        return self.server_id

    def get_server_level(self):
        return self.server_level

    def is_master(self):
        return self.state.get_election_victory_time() != None

    # Tells this server that it is no longer the master. This will reset
    # the internal state.
    def lose_mastership(self):
        """Stop being the master: log the event and reset the state.

        Asserts that this server currently is the master.
        """
        assert self.is_master()

        message = '%s losing mastership' % self.server_id
        logger.info(message)
        self.state.reset()

    # Tells this server that it has become the master (as result of a
    # master election having been triggered).
    def become_master(self):
        """Take over mastership and wake up the pseudo-thread.

        Asserts that this server is not already the master and that its
        state is clean before the election victory time is recorded.
        """
        assert not self.is_master()

        message = '%s becoming master' % self.server_id
        logger.info(message)

        state = self.state
        state.assert_clean()
        state.set_election_victory_time()

        # Reschedules the pseudo-thread (discovery and getting capacity)
        # with a value of 0.
        scheduler.update_thread(self, 0)

    # Returns the reporting data for this server. Just delegates to the
    # wrapped state object.
    def get_reporting_data(self, resource_id):
        assert self.is_master()

        return self.state.get_reporting_data(resource_id)

    # Sends a Discovery RPC to a random task in the server job.
    # This differs from the _discover method in the client code that
    # here we are not interested in the safe capacities. Returns a
    # reference to the server tasks that is the master, or None
    # if we did not find one.
    def _discover(self):
        """Ask a random downstream task who the master is.

        Only valid for servers above level 0 (asserted). Stores the result
        in self.master and returns it: the downstream task named in the
        response's master_bns field, or None when the response has no such
        field.
        """
        assert self.server_level > 0

        request = DiscoveryRequest()
        request.client_id = self.server_id

        # The request goes to a random task of the downstream job.
        task = self.downstream_job.get_random_task()
        response = task.Discovery_RPC(request)

        if not response.HasField('master_bns'):
            # The task that answered does not know of a master.
            self.master = None
            logger.warning('%s doesn\'t know who the master is.' %
                           self.server_id)
            Counter.get('server.discovery_failure').inc()
        else:
            self.master = self.downstream_job.get_task_by_name(
                response.master_bns)

        return self.master

    # Implements the Discovery RPC.
    def Discovery_RPC(self, request):
        """Handle a Discovery RPC and return a DiscoveryResponse.

        The response carries the server id of the job's current master in
        master_bns (left unset when self.job.get_master() returns nothing)
        and one safe_capacity entry for every requested resource id whose
        template has a safe capacity configured.
        """
        assert request.IsInitialized()

        latency = Gauge.get('server.DiscoveryRPC.latency')
        latency.start_timer()
        logger.info('%s handling Discovery RPC from %s' %
                    (self.server_id, request.client_id))
        response = DiscoveryResponse()

        current_master = self.job.get_master()

        if not current_master:
            # No master known: master_bns stays unset.
            Counter.get('server.incomplete_discovery_response').inc()
        else:
            response.master_bns = current_master.get_server_id()

        for resource_id in request.resource_id:
            template = global_config.find_resource_template(resource_id)

            # Skip resources without a template or without a configured
            # safe capacity.
            if not template or not template.HasField('safe_capacity'):
                continue

            entry = response.safe_capacity.add()
            entry.resource_id = resource_id
            entry.safe_capacity = template.safe_capacity

        assert response.IsInitialized()

        latency.stop_timer()

        return response

    # Figured out when to execute the next _get_capacity call. The interval is determined by the
    # refresh_interval settings of the resources in the state.
    def _renew_capacity_interval(self):
        # Figures out the smallest refresh_interval in the server state.
        delay = sys.maxint

        for resource in self.state.all_resources():
            if resource.HasField('has'):
                delay = min(delay, resource.has.refresh_interval)

        # If that delay is highly improbable we have some error and we use
        # a default delay. This might for instance happen if all resources
        # have lost their (or never gotten any) leases.
        if delay <= 0 or delay == sys.maxint:
            logger.error('%s improbable delay %d, set to %d instead' %
                         (self.server_id, delay, _kDefaultRefreshInterval))
            delay = _kDefaultRefreshInterval
            Counter.get('server.improbable.delay').inc()

        return delay

    # Get some capacity from the master downstream server.
    def _get_capacity_downstream(self):
        response = self.master.GetServerCapacity_RPC(
            self.state.fill_server_capacity_request())

        # Did the RPC fail?
        if not response:
            return False

        # Work the response into the state.
        self.state.process_capacity_response(response)

        return True

    # Get some capacity for this server to hand out. Returns a boolean to
    # indicate whether this succeeded or failed.
    def _get_capacity(self):
        """Obtain capacity for this server to hand out.

        A level 0 server gives every resource in the state a fresh lease
        for its template capacity; any other server asks the downstream
        master. The resulting resource state is logged either way.

        Returns:
          True for a level 0 server, otherwise the result of
          _get_capacity_downstream().
        """
        assert self.is_master()

        now = clock.get_time()

        if self.server_level != 0:
            # Not the root server: capacity comes from a downstream server.
            success = self._get_capacity_downstream()
        else:
            # Root server: capacity comes from the configuration.
            for resource in self.state.all_resources():
                algo = AlgorithmImpl.create(resource.template,
                                            self.server_level)
                resource.ClearField('has')
                lease = algo.create_lease(resource,
                                          resource.template.capacity)
                resource.has.CopyFrom(lease)

                # The lease's refresh interval is doubled. Capacity from the
                # configuration lasts forever, but keeping a refresh interval
                # and relatively short leases ensures that configuration
                # changes (e.g. from CDD) are picked up.
                resource.has.refresh_interval *= 2

            success = True

        logger.info('%s resource state after getting capacity:' %
                    self.server_id)

        for resource in self.state.all_resources():
            lease = resource.has
            logger.info('resource: %s got: %lf lease: %d refresh: %d' %
                        (resource.resource_id, lease.capacity,
                         lease.expiry_time - now, lease.refresh_interval))

        return success

    # Implements the GetServerCapacity RPC.
    def GetServerCapacity_RPC(self, request):
        """Handle a GetServerCapacity RPC from another server.

        Works in two passes over request.resource:

        1. Record what the requesting server reported (outstanding, wants
           and has) in the per-server state of each known resource.
           Resources requested again within _kMinimumInterval of the
           previous request are not updated and are skipped in pass 2.
        2. Hand out capacity for every non-skipped resource. A resource in
           learning mode gets a lease for what the server already has;
           otherwise the algorithm for the resource is run. A resource this
           server does not manage is granted the sum of what was asked for.

        Args:
          request: the request message; must be initialized.

        Returns:
          A GetServerCapacityResponse, or None if this server is not the
          master.
        """
        assert request.IsInitialized()
        assert self.state.is_initialized()

        # Only the master can handle this RPC.
        if not self.is_master():
            self.state.assert_clean()
            logger.info(
                '%s getting a GetServerCapacity request when not master' %
                self.server_id)

            Counter.get('server.GetServerCapacity_RPC.not_master').inc()

            return None

        gauge = Gauge.get('server.GetServerCapacity_RPC.latency')
        gauge.start_timer()
        logger.debug(request)
        now = clock.get_time()

        # Cleans the state. This removes resources and clients with expired
        # leases and such.
        self.state.cleanup()

        # A set of resources that we need to skip in step 2 (the actual
        # handing out of capacity).
        resources_to_skip = set()

        # First step: Go through the request and update the state with the
        # information from the request.
        for req in request.resource:
            (resource, sr) = self.state.find_server_resource(
                request.server_id, req.resource_id)

            # If this resource does not exist we don't need to do anything
            # right now.
            if resource:
                assert sr

                # Checks whether the last request from this server was at
                # least _kMinimumInterval seconds ago.
                if (sr.HasField('last_request_time') and
                        now - sr.last_request_time < _kMinimumInterval):
                    logger.warning(
                        '%s GetServerCapacity request for resource %s within the %d '
                        'second threshold' %
                        (self.server_id, req.resource_id, _kMinimumInterval))
                    resources_to_skip.add(req.resource_id)
                else:
                    # Updates the state with the information in the request.
                    sr.last_request_time = now
                    sr.outstanding = req.outstanding
                    del sr.wants[:]

                    for w in req.wants:
                        sr.wants.add().CopyFrom(w)

                    if req.HasField('has'):
                        sr.has.CopyFrom(req.has)
                    else:
                        sr.ClearField('has')

        # Creates a new response object in which we will insert the response
        # for the resources contained in the request.
        response = GetServerCapacityResponse()

        # Step 2: Loop through all the individual resource requests in the
        # request and hand out capacity.
        for req in request.resource:
            # If this is a resource we need to skip, let's skip it.
            if req.resource_id in resources_to_skip:
                continue

            # req.wants is a repeated field here, so the total demand of the
            # requesting server is the sum over its entries.
            wanted = sum(w.wants for w in req.wants)

            # Finds the resource and the server state for this resource.
            (resource, sr) = self.state.find_server_resource(
                request.server_id, req.resource_id)

            # Adds a response proto to the overall response.
            resp = response.resource.add()
            resp.resource_id = req.resource_id

            # If this is an unknown resource just give the requester whatever
            # it is asking for.
            if not resource:
                assert not sr

                logger.warning(
                    '%s GetServerCapacity request for unmanaged resource %s' %
                    (self.server_id, req.resource_id))
                resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
                # Grant the summed demand; assigning the repeated wants
                # container itself to the scalar capacity field is invalid.
                resp.gets.capacity = wanted
            else:
                # Finds the algorithm implementation object for this resource.
                algo = AlgorithmImpl.create(resource.template,
                                            self.server_level)

                # If the resource is in learning mode we just return whatever
                # the requester has now and create a default lease.
                if resource.learning_mode_expiry_time >= now:
                    if sr.HasField('has'):
                        has_now = sr.has.capacity
                    else:
                        has_now = 0

                    sr.has.CopyFrom(algo.create_lease(resource, has_now))
                else:
                    # Otherwise we just run the algorithm. This will update
                    # the server state object.
                    algo.run_server(resource, sr)

                # Copies the output from the algorithm run into the response.
                resp.gets.CopyFrom(sr.has)

            assert resp.IsInitialized()
            logger.info(
                '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d'
                % (self.server_id, request.server_id, req.resource_id,
                   wanted, resp.gets.capacity,
                   resp.gets.expiry_time - now, resp.gets.refresh_interval))

        assert response.IsInitialized()

        gauge.stop_timer()

        return response

    # Implements the GetCapacity RPC.
    def GetCapacity_RPC(self, request):
        """Handle a GetCapacity RPC from a client.

        Works in two passes over request.resource:

        1. Record what the client reported (priority, wants and has) in the
           per-client state of each known resource. Resources requested
           again within _kMinimumInterval of the previous request are not
           updated and are skipped in pass 2.
        2. Hand out capacity for every non-skipped resource. A resource in
           learning mode gets a lease for what the client already has;
           otherwise the algorithm for the resource is run. A resource this
           server does not manage is granted whatever the client wants.

        Args:
          request: the request message; must be initialized.

        Returns:
          A GetCapacityResponse, or None if this server is not the master.
        """
        assert request.IsInitialized()
        assert self.state.is_initialized()

        # If this server is not the master it cannot handle this request.
        # The client should do a new Discovery.
        if not self.is_master():
            self.state.assert_clean()
            logger.info('%s getting a GetCapacity request when not master' %
                        self.server_id)
            Counter.get('server.GetCapacity_RPC.not_master').inc()

            return None

        # The gauge timer runs from here until just before the response is
        # returned.
        timer = Gauge.get('server.GetCapacity_RPC.latency')
        timer.start_timer()
        logger.debug(request)
        now = clock.get_time()

        # Cleanup the state. This removes resources and clients with expired
        # leases and such.
        self.state.cleanup()

        # A set of resources that we need to skip in step 2 (the actual
        # handing out of capacity).
        resources_to_skip = set()

        # First step: Go through the request and update the state with the
        # information from the request.
        for req in request.resource:
            # Finds the resource and the client state for this resource.
            (resource,
             cr) = self.state.find_client_resource(request.client_id,
                                                   req.resource_id)

            # If this resource does not exist we don't need to do anything
            # right now.
            if resource:
                assert cr

                # Checks whether the last request from this client was at least
                # _kMinimumInterval seconds ago.
                if cr.HasField(
                        'last_request_time'
                ) and now - cr.last_request_time < _kMinimumInterval:
                    logger.warning(
                        '%s GetCapacity request for resource %s within the %d second '
                        'threshold' %
                        (self.server_id, req.resource_id, _kMinimumInterval))
                    resources_to_skip.add(req.resource_id)
                else:
                    # Updates the state with the information in the request.
                    cr.last_request_time = now
                    cr.priority = req.priority
                    cr.wants = req.wants

                    # Mirror the client's view of its current lease: copy it
                    # if reported, otherwise drop any lease we had on record.
                    if req.HasField('has'):
                        cr.has.CopyFrom(req.has)
                    else:
                        cr.ClearField('has')

        # Creates a new response object in which we will insert the responses for
        # the resources contained in the request.
        response = GetCapacityResponse()

        # Step 2: Loop through all the individual resource requests in the request
        # and hand out capacity.
        for req in request.resource:
            # If this is a resource we need to skip, let's skip it.
            if req.resource_id in resources_to_skip:
                continue

            # Finds the resource and the client state for this resource.
            (resource,
             cr) = (self.state.find_client_resource(request.client_id,
                                                    req.resource_id))

            # Adds a response proto to the overall response.
            resp = response.response.add()
            resp.resource_id = req.resource_id

            # If this is an unknown resource just give the client whatever it
            # is asking for.
            if not resource:
                assert not cr

                logger.warning(
                    '%s GetCapacity request for unmanaged resource %s' %
                    (self.server_id, req.resource_id))
                resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
                resp.gets.capacity = req.wants
            else:
                # Sets the safe capacity in the response if there is one
                # configured for this resource.
                if resource.template.HasField('safe_capacity'):
                    resp.safe_capacity = resource.template.safe_capacity

                # Finds the algorithm implementation object for this resource.
                algo = AlgorithmImpl.create(resource.template,
                                            self.server_level)

                # If the resource is in learning mode we just return whatever the client
                # has now and create a default lease.
                if resource.learning_mode_expiry_time >= now:
                    if cr.HasField('has'):
                        has_now = cr.has.capacity
                    else:
                        has_now = 0

                    cr.has.CopyFrom(algo.create_lease(resource, has_now))
                    Counter.get('server.learning_mode_response').inc()
                else:
                    # Otherwise we just run the algorithm. This will update the
                    # client state object.
                    algo.run_client(resource, cr)
                    Counter.get('server.algorithm_runs').inc()

                # Copies the output from the algorithm run into the response.
                resp.gets.CopyFrom(cr.has)

            assert resp.IsInitialized()
            logger.info(
                '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d'
                % (self.server_id, request.client_id, req.resource_id,
                   req.wants, resp.gets.capacity, resp.gets.expiry_time - now,
                   resp.gets.refresh_interval))

        assert response.IsInitialized()

        timer.stop_timer()

        return response

    # This is the main function of the pseudo-thread. It needs to
    # figure out what needs to be done, then do it, and return
    # the timestamp when the next action needs to be scheduled.
    def thread_continue(self):
        # If we are not the master server in the job, we don't need to
        # do anything. Our next scheduled action is at the end of time.
        # Note: When we become the master we will update this interval.
        if not self.is_master():
            Counter.get('server.halt_thread').inc()
            return _kTheEndOfTime

        # If this is not the root server we might need to do a discovery.
        if self.server_level > 0:
            # If we don't know who the master is let's figure this out.
            if not self.master:
                # If discovery failed, try another discovery in the
                # near future
                if not self._discover():
                    return _kDefaultDiscoveryInterval

        # Either we know who the master is or we don't need to know because
        # we are the root server. Let's get some capacity. If this
        # fails we need to reschedule a discovery.
        if not self._get_capacity():
            Counter.get('server.reschedule_discovery').inc()
            self.master = None

            return 0

        # Returns the interval in which we need to refresh our capacity
        # leases.
        return self._renew_capacity_interval()
Exemplo n.º 4
0
class Server(object):
  # Used to generate server identifiers.
  num_servers = dict()

  # Constructor.
  def __init__(self, job, job_name, server_level, downstream_job=None):
    """Create a server task and register its pseudo-thread.

    Args:
      job: stored as self.job.
      job_name: prefix of the generated server id and key of the
        per-job counter kept in Server.num_servers.
      server_level: level of this server. Level 0 must not have a
        downstream job; every other level must have one.
      downstream_job: stored as self.downstream_job.
    """
    # A downstream job is absent exactly when this is a level 0 server.
    assert (downstream_job is None) == (server_level == 0)

    self.server_level = server_level
    self.master = None
    self.downstream_job = downstream_job
    self.job = job

    # Servers are numbered per job name, starting at 1.
    sequence = Server.num_servers.get(job_name, 0) + 1
    Server.num_servers[job_name] = sequence
    self.server_id = '%s:%d' % (job_name, sequence)

    self.state = ServerStateWrapper(self)

    # Registers the pseudo-thread that does discovery and gets
    # resource capacity.
    scheduler.add_thread(self, 0)

  def get_server_id(self):
    return self.server_id

  def get_server_level(self):
    return self.server_level

  def is_master(self):
    return self.state.get_election_victory_time() != None

  # Tells this server that it is no longer the master. This will reset
  # the internal state.
  def lose_mastership(self):
    """Stop being the master: log the event and reset the state.

    Asserts that this server currently is the master.
    """
    assert self.is_master()

    message = '%s losing mastership' % self.server_id
    logger.info(message)
    self.state.reset()

  # Tells this server that it has become the master (as result of a
  # master election having been triggered).
  def become_master(self):
    """Take over mastership and wake up the pseudo-thread.

    Asserts that this server is not already the master and that its
    state is clean before the election victory time is recorded.
    """
    assert not self.is_master()

    message = '%s becoming master' % self.server_id
    logger.info(message)

    state = self.state
    state.assert_clean()
    state.set_election_victory_time()

    # Reschedules the pseudo-thread (discovery and getting capacity)
    # with a value of 0.
    scheduler.update_thread(self, 0)

  # Returns the reporting data for this server. Just delegates to the
  # wrapped state object.
  def get_reporting_data(self, resource_id):
    assert self.is_master()

    return self.state.get_reporting_data(resource_id)

  # Sends a Discovery RPC to a random task in the server job.
  # This differs from the _discover method in the client code that
  # here we are not interested in the safe capacities. Returns a
  # reference to the server tasks that is the master, or None
  # if we did not find one.
  def _discover(self):
    """Ask a random downstream task who the master is.

    Only valid for servers above level 0 (asserted). Stores the result
    in self.master and returns it: the downstream task named in the
    response's master_bns field, or None when the response has no such
    field.
    """
    assert self.server_level > 0

    request = DiscoveryRequest()
    request.client_id = self.server_id

    # The request goes to a random task of the downstream job.
    task = self.downstream_job.get_random_task()
    response = task.Discovery_RPC(request)

    if not response.HasField('master_bns'):
      # The task that answered does not know of a master.
      self.master = None
      logger.warning('%s doesn\'t know who the master is.' % self.server_id)
      Counter.get('server.discovery_failure').inc()
    else:
      self.master = self.downstream_job.get_task_by_name(response.master_bns)

    return self.master

  # Implements the Discovery RPC.
  def Discovery_RPC(self, request):
    """Handle a Discovery RPC and return a DiscoveryResponse.

    The response carries the server id of the job's current master in
    master_bns (left unset when self.job.get_master() returns nothing)
    and one safe_capacity entry for every requested resource id whose
    template has a safe capacity configured.
    """
    assert request.IsInitialized()

    latency = Gauge.get('server.DiscoveryRPC.latency')
    latency.start_timer()
    logger.info(
        '%s handling Discovery RPC from %s' %
        (self.server_id, request.client_id))
    response = DiscoveryResponse()

    current_master = self.job.get_master()

    if not current_master:
      # No master known: master_bns stays unset.
      Counter.get('server.incomplete_discovery_response').inc()
    else:
      response.master_bns = current_master.get_server_id()

    for resource_id in request.resource_id:
      template = global_config.find_resource_template(resource_id)

      # Skip resources without a template or without a configured
      # safe capacity.
      if not template or not template.HasField('safe_capacity'):
        continue

      entry = response.safe_capacity.add()
      entry.resource_id = resource_id
      entry.safe_capacity = template.safe_capacity

    assert response.IsInitialized()

    latency.stop_timer()

    return response

  # Figured out when to execute the next _get_capacity call. The interval is determined by the
  # refresh_interval settings of the resources in the state.
  def _renew_capacity_interval(self):
    # Figures out the smallest refresh_interval in the server state.
    delay = sys.maxint

    for resource in self.state.all_resources():
      if resource.HasField('has'):
        delay = min(delay, resource.has.refresh_interval)

    # If that delay is highly improbable we have some error and we use
    # a default delay. This might for instance happen if all resources
    # have lost their (or never gotten any) leases.
    if delay <= 0 or delay == sys.maxint:
      logger.error(
          '%s improbable delay %d, set to %d instead' %
          (self.server_id, delay, _kDefaultRefreshInterval))
      delay = _kDefaultRefreshInterval
      Counter.get('server.improbable.delay').inc()

    return delay

  # Get some capacity from the master downstream server.
  def _get_capacity_downstream(self):
    response = self.master.GetServerCapacity_RPC(
        self.state.fill_server_capacity_request())

    # Did the RPC fail?
    if not response:
      return False

    # Work the response into the state.
    self.state.process_capacity_response(response)

    return True

  # Get some capacity for this server to hand out. Returns a boolean to
  # indicate whether this succeeded or failed.
  def _get_capacity(self):
    """Obtain capacity for this server to hand out.

    A level 0 server gives every resource in the state a fresh lease
    for its template capacity; any other server asks the downstream
    master. The resulting resource state is logged either way.

    Returns:
      True for a level 0 server, otherwise the result of
      _get_capacity_downstream().
    """
    assert self.is_master()

    now = clock.get_time()

    if self.server_level != 0:
      # Not the root server: capacity comes from a downstream server.
      success = self._get_capacity_downstream()
    else:
      # Root server: capacity comes from the configuration.
      for resource in self.state.all_resources():
        algo = AlgorithmImpl.create(resource.template, self.server_level)
        resource.ClearField('has')
        lease = algo.create_lease(resource, resource.template.capacity)
        resource.has.CopyFrom(lease)

        # The lease's refresh interval is doubled. Capacity from the
        # configuration lasts forever, but keeping a refresh interval
        # and relatively short leases ensures that configuration
        # changes (e.g. from CDD) are picked up.
        resource.has.refresh_interval *= 2

      success = True

    logger.info('%s resource state after getting capacity:' % self.server_id)

    for resource in self.state.all_resources():
      lease = resource.has
      logger.info(
          'resource: %s got: %lf lease: %d refresh: %d' %
          (resource.resource_id, lease.capacity,
           lease.expiry_time - now, lease.refresh_interval))

    return success

  # Implements the GetServerCapacity RPC.
  def GetServerCapacity_RPC(self, request):
    """Handle a GetServerCapacity RPC from another server.

    Works in two passes over request.resource:

    1. Record what the requesting server reported (outstanding, wants
       and has) in the per-server state of each known resource.
       Resources requested again within _kMinimumInterval of the
       previous request are not updated and are skipped in pass 2.
    2. Hand out capacity for every non-skipped resource. A resource in
       learning mode gets a lease for what the server already has;
       otherwise the algorithm for the resource is run. A resource this
       server does not manage is granted the sum of what was asked for.

    Args:
      request: the request message; must be initialized.

    Returns:
      A GetServerCapacityResponse, or None if this server is not the
      master.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # Only the master can handle this RPC.
    if not self.is_master():
      self.state.assert_clean()
      logger.info(
          '%s getting a GetServerCapacity request when not master' %
          self.server_id)

      Counter.get('server.GetServerCapacity_RPC.not_master').inc()

      return None

    gauge = Gauge.get('server.GetServerCapacity_RPC.latency')
    gauge.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleans the state. This removes resources and clients with expired
    # leases and such.
    self.state.cleanup()

    # A set of resources that we need to skip in step 2 (the actual
    # handing out of capacity).
    resources_to_skip = set()

    # First step: Go through the request and update the state with the
    # information from the request.
    for req in request.resource:
      (resource, sr) = self.state.find_server_resource(
          request.server_id, req.resource_id)

      # If this resource does not exist we don't need to do anything right now.
      if resource:
        assert sr

        # Checks whether the last request from this server was at least
        # _kMinimumInterval seconds ago.
        if (sr.HasField('last_request_time') and
            now - sr.last_request_time < _kMinimumInterval):
          logger.warning(
              '%s GetServerCapacity request for resource %s within the %d '
              'second threshold' %
              (self.server_id, req.resource_id, _kMinimumInterval))
          resources_to_skip.add(req.resource_id)
        else:
          # Updates the state with the information in the request.
          sr.last_request_time = now
          sr.outstanding = req.outstanding
          del sr.wants[:]

          for w in req.wants:
            sr.wants.add().CopyFrom(w)

          if req.HasField('has'):
            sr.has.CopyFrom(req.has)
          else:
            sr.ClearField('has')

    # Creates a new response object in which we will insert the response for
    # the resources contained in the request.
    response = GetServerCapacityResponse()

    # Step 2: Loop through all the individual resource requests in the request
    # and hand out capacity.
    for req in request.resource:
      # If this is a resource we need to skip, let's skip it.
      if req.resource_id in resources_to_skip:
        continue

      # req.wants is a repeated field here, so the total demand of the
      # requesting server is the sum over its entries.
      wanted = sum(w.wants for w in req.wants)

      # Finds the resource and the server state for this resource.
      (resource, sr) = self.state.find_server_resource(
          request.server_id, req.resource_id)

      # Adds a response proto to the overall response.
      resp = response.resource.add()
      resp.resource_id = req.resource_id

      # If this is an unknown resource just give the requester whatever it
      # is asking for.
      if not resource:
        assert not sr

        logger.warning(
            '%s GetServerCapacity request for unmanaged resource %s' %
            (self.server_id, req.resource_id))
        resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
        # Grant the summed demand; assigning the repeated wants container
        # itself to the scalar capacity field is invalid.
        resp.gets.capacity = wanted
      else:
        # Finds the algorithm implementation object for this resource.
        algo = AlgorithmImpl.create(resource.template, self.server_level)

        # If the resource is in learning mode we just return whatever the
        # requester has now and create a default lease.
        if resource.learning_mode_expiry_time >= now:
          if sr.HasField('has'):
            has_now = sr.has.capacity
          else:
            has_now = 0

          sr.has.CopyFrom(algo.create_lease(resource, has_now))
        else:
          # Otherwise we just run the algorithm. This will update the
          # server state object.
          algo.run_server(resource, sr)

        # Copies the output from the algorithm run into the response.
        resp.gets.CopyFrom(sr.has)

      assert resp.IsInitialized()
      logger.info(
          '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
          (self.server_id, request.server_id, req.resource_id,
           wanted, resp.gets.capacity,
           resp.gets.expiry_time - now, resp.gets.refresh_interval))

    assert response.IsInitialized()

    gauge.stop_timer()

    return response

  # Implements the GetCapacity RPC.
  def GetCapacity_RPC(self, request):
    """Handles a GetCapacity request from a client.

    The request is processed in two passes over request.resource. The first
    pass stores what the client reports (priority, wants and, if present,
    has) in the server state. The second pass hands out capacity and builds
    the response.

    Args:
      request: the incoming request message. This method reads its
        client_id field and, for each entry of its resource field, the
        resource_id, priority, wants and optional has fields.

    Returns:
      None if this server is not the master. Otherwise a
      GetCapacityResponse with one entry per requested resource, except
      for known resources that the same client asked about less than
      _kMinimumInterval seconds ago; those get no entry at all.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # Only the master may serve this request. Returning None signals the
    # client that it should run a new Discovery.
    if not self.is_master():
      self.state.assert_clean()
      not_master_msg = (
          '%s getting a GetCapacity request when not master' %
          self.server_id)
      logger.info(not_master_msg)
      Counter.get('server.GetCapacity_RPC.not_master').inc()

      return None

    latency = Gauge.get('server.GetCapacity_RPC.latency')
    latency.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Drop resources and clients whose leases have expired before looking
    # anything up in the state.
    self.state.cleanup()

    client_id = request.client_id

    # Ids of the resources that pass 2 must not answer because the client
    # asked again too quickly.
    throttled = set()

    # Pass 1: record the information from the request in the state.
    for item in request.resource:
      resource, client_state = self.state.find_client_resource(
          client_id, item.resource_id)

      # Nothing to record for a resource that does not exist.
      if not resource:
        continue

      assert client_state

      # True when the previous request from this client for this resource
      # was less than _kMinimumInterval seconds ago.
      too_soon = (
          client_state.HasField('last_request_time') and
          now - client_state.last_request_time < _kMinimumInterval)

      if too_soon:
        logger.warning(
            '%s GetCapacity request for resource %s within the %d second '
            'threshold' %
            (self.server_id, item.resource_id, _kMinimumInterval))
        throttled.add(item.resource_id)
        continue

      # Accept the request: copy the client's view into the state.
      client_state.last_request_time = now
      client_state.priority = item.priority
      client_state.wants = item.wants

      if item.HasField('has'):
        client_state.has.CopyFrom(item.has)
      else:
        client_state.ClearField('has')

    # The per-resource answers are collected in this response object.
    response = GetCapacityResponse()

    # Pass 2: hand out capacity for every resource that was not throttled.
    for item in request.resource:
      if item.resource_id in throttled:
        continue

      resource, client_state = self.state.find_client_resource(
          client_id, item.resource_id)

      entry = response.response.add()
      entry.resource_id = item.resource_id

      if resource:
        # Pass on the safe capacity when the resource template has one.
        if resource.template.HasField('safe_capacity'):
          entry.safe_capacity = resource.template.safe_capacity

        algo = AlgorithmImpl.create(resource.template, self.server_level)

        if resource.learning_mode_expiry_time >= now:
          # Learning mode: give the client a default lease on whatever
          # capacity it currently holds (0 if it holds nothing).
          held = (
              client_state.has.capacity
              if client_state.HasField('has') else 0)
          client_state.has.CopyFrom(algo.create_lease(resource, held))
          Counter.get('server.learning_mode_response').inc()
        else:
          # Normal mode: the algorithm run updates the client state.
          algo.run_client(resource, client_state)
          Counter.get('server.algorithm_runs').inc()

        # The lease now stored in the client state is what we answer with.
        entry.gets.CopyFrom(client_state.has)
      else:
        # Unmanaged resource: grant exactly what the client asks for.
        assert not client_state

        logger.warning(
            '%s GetCapacity request for unmanaged resource %s' %
            (self.server_id, item.resource_id))
        entry.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
        entry.gets.capacity = item.wants

      assert entry.IsInitialized()
      granted = entry.gets
      logger.info(
          '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
          (self.server_id, client_id, item.resource_id, item.wants,
           granted.capacity, granted.expiry_time - now,
           granted.refresh_interval))

    assert response.IsInitialized()

    latency.stop_timer()

    return response

  # This is the main function of the pseudo-thread. It needs to
  # figure out what needs to be done, then do it, and return
  # the timestamp when the next action needs to be scheduled.
  def thread_continue(self):
    """Runs one step of this server's pseudo-thread.

    Returns:
      The value used to schedule the next step: _kTheEndOfTime when this
      server is not the master, _kDefaultDiscoveryInterval when a required
      discovery failed, 0 when getting capacity failed, and otherwise the
      result of self._renew_capacity_interval().
    """
    # A non-master server has nothing to do. The schedule is updated again
    # once this server becomes the master.
    if not self.is_master():
      Counter.get('server.halt_thread').inc()
      return _kTheEndOfTime

    # Only a non-root server that does not know its master has to run a
    # discovery. If that discovery fails, retry in the near future.
    must_discover = self.server_level > 0 and not self.master
    if must_discover and not self._discover():
      return _kDefaultDiscoveryInterval

    # Either the master is known or this is the root server, so capacity
    # can be fetched. On success, report when the leases need renewing.
    if self._get_capacity():
      return self._renew_capacity_interval()

    # Getting capacity failed: forget the master so that the next step,
    # scheduled right away, starts with a fresh discovery.
    Counter.get('server.reschedule_discovery').inc()
    self.master = None

    return 0