def thread_continue(self):
    """Runs one scheduling step and returns the delay until the next one.

    Returns:
      _kTheEndOfTime when this server is not the master of its job,
      _kDefaultDiscoveryInterval when a required discovery failed,
      0 when getting capacity failed (the known master is forgotten first),
      and otherwise whatever self._renew_capacity_interval() returns.
    """
    # A non-master server has nothing to do. The interval gets updated
    # once this server becomes the master.
    if not self.is_master():
        Counter.get('server.halt_thread').inc()
        return _kTheEndOfTime

    # Only a non-root server (server_level > 0) that does not know its
    # master has to discover it; a failed discovery is retried soon.
    needs_discovery = self.server_level > 0 and not self.master
    if needs_discovery and not self._discover():
        return _kDefaultDiscoveryInterval

    # At this point the master is known, or we are the root server and
    # do not need one. Try to get capacity.
    if self._get_capacity():
        # Tells the caller when the capacity leases need refreshing.
        return self._renew_capacity_interval()

    # Getting capacity failed: drop the master so that the next step
    # runs a discovery, and ask to be scheduled again immediately.
    Counter.get('server.reschedule_discovery').inc()
    self.master = None
    return 0
def _discover(self):
    """Asks a random downstream task who the master is and stores the answer.

    Besides updating self.master, every safe capacity found in the
    response is copied onto the matching resource in the client state.

    Returns:
      The task now stored in self.master, or None when the response did
      not carry a master_bns field.
    """
    assert self.state.IsInitialized()

    discovery = DiscoveryRequest()
    discovery.client_id = self.state.client_id

    # Lists every resource we know about so that the response can carry
    # the safe capacities for them.
    for known in self.state.resource:
        discovery.resource_id.append(known.resource_id)

    # Any task of the server job can answer a discovery request.
    task = self.downstream_job.get_random_task()
    reply = task.Discovery_RPC(discovery)

    if not reply.HasField('master_bns'):
        # The task we asked does not know of a master.
        self.master = None
        logger.warning('%s doesn\'t know who the master is.' %
                       self.state.client_id)
        Counter.get('client.discovery_failure').inc()
    else:
        self.master = self.downstream_job.get_task_by_name(reply.master_bns)

    # Safe capacities are recorded whether or not a master was found.
    for entry in reply.safe_capacity:
        target = self._find_resource(entry.resource_id)
        target.safe_capacity = entry.safe_capacity

    return self.master
def Discovery_RPC(self, request):
    """Answers a discovery request with the current master and safe capacities.

    Args:
      request: the discovery request; its client_id is logged and its
        resource_id entries are looked up in global_config.

    Returns:
      A DiscoveryResponse. master_bns is only set when self.job reports a
      master; one safe_capacity entry is added per requested resource
      whose template has a safe_capacity field.
    """
    assert request.IsInitialized()

    latency = Gauge.get('server.DiscoveryRPC.latency')
    latency.start_timer()
    logger.info('%s handling Discovery RPC from %s' %
                (self.server_id, request.client_id))

    reply = DiscoveryResponse()

    current_master = self.job.get_master()
    if not current_master:
        # No master is known, so the response stays without master_bns.
        Counter.get('server.incomplete_discovery_response').inc()
    else:
        reply.master_bns = current_master.get_server_id()

    for resource_id in request.resource_id:
        template = global_config.find_resource_template(resource_id)
        # Skip resources without a template or without a configured
        # safe capacity.
        if not template or not template.HasField('safe_capacity'):
            continue
        entry = reply.safe_capacity.add()
        entry.resource_id = resource_id
        entry.safe_capacity = template.safe_capacity

    assert reply.IsInitialized()
    latency.stop_timer()
    return reply
def _maybe_lease_expired(self, resource_id):
    """Clears the capacity lease of a resource if it has expired.

    Args:
      resource_id: id handed to self._find_resource() to locate the
        resource to check.
    """
    leased = self._find_resource(resource_id)

    # Nothing to clean up while lease_expired() does not report expiry.
    if not lease_expired(leased):
        return

    leased.ClearField('has')
    message = '%s lease on capacity for resource %s expired' % (
        self.get_client_id(), leased.resource_id)
    logger.info(message)
    Counter.get('client.lease_expired').inc()
def _get_capacity(self): assert self.state.IsInitialized() assert self.master # If there are no resources in the state we're done. if len(self.state.resource) == 0: return # Creates the RPC request object. request = GetCapacityRequest() request.client_id = self.state.client_id # Goes through all the resources in the state and adds a subrequest # for that resource to the request. for res in self.state.resource: req = request.resource.add() req.resource_id = res.resource_id req.priority = res.priority req.wants = res.wants if res.HasField('has'): req.has.CopyFrom(res.has) # Calls the GetCapacity RPC in the master. response = self.master.GetCapacity_RPC(request) # If that failed we did not get new capacities. Blimey! Most probably # the server we talked to is no longer the master, so schedule a discovery # asap. When that discovery succeeds it will call us again. if not response: logger.error('%s GetCapacity request failed!' % self.get_client_id()) Counter.get('client.GetCapacity_RPC.failure').inc() return False else: # Goes through the response and copies the capacity information back into # the client state. for r in response.response: assert r.gets.capacity >= 0 resource = self._find_resource(r.resource_id) resource.has.CopyFrom(r.gets) # Schedules an action at the expiry time to clear out the lease. scheduler.add_absolute( resource.has.expiry_time, lambda: self._maybe_lease_expired(r.resource_id)) if r.HasField('safe_capacity'): resource.safe_capacity = r.safe_capacity else: resource.ClearField('safe_capacity') return True
def _maybe_lease_expired(self, resource_id): # If we are no longer the master this action does not need to' # be executed anymore. if not self.server.is_master(): return resource = self.find_resource(resource_id) if lease_expired(resource): resource.ClearField('has') logger.info('%s lease on capacity for resource %s expired' % (self.get_server_id(), resource.resource_id)) Counter.get('server.lease_expired').inc()
def _maybe_lease_expired(self, resource_id): # If we are no longer the master this action does not need to' # be executed anymore. if not self.server.is_master(): return resource = self.find_resource(resource_id) if lease_expired(resource): resource.ClearField('has') logger.info( '%s lease on capacity for resource %s expired' % (self.get_server_id(), resource.resource_id)) Counter.get('server.lease_expired').inc()
def random_mishap():
    """Reschedules itself and runs one entry of _mishap_map.

    A target is drawn uniformly from [0, max key - 1]. The map is then
    walked while a running sum of the keys is kept; the first entry
    reached with a running sum at or above the target is run, after
    incrementing the counter named 'mishap.<key>'.
    """
    # Keeps the mishaps coming: schedule the next call 60 from now.
    scheduler.add_relative(60, lambda: random_mishap())

    upper = max(_mishap_map.keys())
    target = random.randint(0, upper - 1)

    running = 0
    for (key, mishap) in _mishap_map.iteritems():
        if running < target:
            running += key
            continue

        Counter.get('mishap.%d' % key).inc()
        mishap()
        return

    # NOTE(review): this is hit whenever the running sum never reaches the
    # target before the map is exhausted; whether the contents of
    # _mishap_map rule that out is not visible here -- confirm.
    assert False
def _renew_capacity_interval(self):
    """Returns the delay after which capacity leases should be refreshed.

    Returns:
      The smallest refresh_interval among the resources in the server
      state that have a 'has' lease, or _kDefaultRefreshInterval when
      that value is not positive or no resource has a lease.
    """
    intervals = [
        res.has.refresh_interval
        for res in self.state.all_resources()
        if res.HasField('has')
    ]

    # sys.maxint seeds the minimum, so it doubles as the marker for
    # "no resource currently has a lease".
    delay = min([sys.maxint] + intervals)

    if delay > 0 and delay != sys.maxint:
        return delay

    # An improbable delay points at an error, for instance when all
    # resources have lost (or never got) their leases; use the default.
    logger.error('%s improbable delay %d, set to %d instead' %
                 (self.server_id, delay, _kDefaultRefreshInterval))
    Counter.get('server.improbable.delay').inc()
    return _kDefaultRefreshInterval
def _renew_capacity_interval(self):
    """Returns the delay after which capacity leases should be refreshed.

    Returns:
      The smallest refresh_interval among the resources in the client
      state that have a 'has' lease, or _kDefaultRefreshInterval when
      that value is not positive or no resource has a lease.
    """
    intervals = [
        res.has.refresh_interval
        for res in self.state.resource
        if res.HasField('has')
    ]

    # sys.maxint seeds the minimum, so it doubles as the marker for
    # "no resource currently has a lease".
    delay = min([sys.maxint] + intervals)

    if delay > 0 and delay != sys.maxint:
        return delay

    # An improbable delay points at an error; fall back to the default.
    logger.error('%s improbable delay %d, set to %d instead' %
                 (self.state.client_id, delay, _kDefaultRefreshInterval))
    Counter.get('client.improbable.delay').inc()
    return _kDefaultRefreshInterval
def _discover(self):
    """Asks a random downstream task who the master is and stores the answer.

    Must only be called on a server with server_level > 0.

    Returns:
      The task now stored in self.master, or None when the response did
      not carry a master_bns field.
    """
    assert self.server_level > 0

    discovery = DiscoveryRequest()
    discovery.client_id = self.server_id

    # Any task of the server job can answer a discovery request.
    task = self.downstream_job.get_random_task()
    reply = task.Discovery_RPC(discovery)

    if not reply.HasField('master_bns'):
        # The task we asked does not know of a master.
        self.master = None
        logger.warning('%s doesn\'t know who the master is.' %
                       self.server_id)
        Counter.get('server.discovery_failure').inc()
    else:
        self.master = self.downstream_job.get_task_by_name(reply.master_bns)

    return self.master
def _renew_capacity_interval(self):
    """Computes how long to wait before refreshing the capacity leases.

    Returns:
      The minimum refresh_interval over all resources in the server state
      that have a 'has' lease. _kDefaultRefreshInterval is returned
      instead when that minimum is not positive or no lease exists.
    """
    # Starts from sys.maxint so that an untouched value means that no
    # resource had a lease.
    shortest = sys.maxint
    leased = (res for res in self.state.all_resources()
              if res.HasField('has'))
    for res in leased:
        shortest = min(shortest, res.has.refresh_interval)

    usable = shortest > 0 and shortest != sys.maxint
    if not usable:
        # Might for instance happen if all resources have lost (or never
        # gotten any) leases.
        logger.error(
            '%s improbable delay %d, set to %d instead' %
            (self.server_id, shortest, _kDefaultRefreshInterval))
        shortest = _kDefaultRefreshInterval
        Counter.get('server.improbable.delay').inc()

    return shortest
def _discover(self):
    """Finds out who the master is by asking a random downstream task.

    Only valid for servers with server_level > 0. The result is stored
    in self.master.

    Returns:
      self.master after the update: the task named by the response's
      master_bns field, or None when that field is absent.
    """
    assert self.server_level > 0

    request = DiscoveryRequest()
    request.client_id = self.server_id

    # Sends the request to a random task in the server job.
    responder = self.downstream_job.get_random_task()
    response = responder.Discovery_RPC(request)

    knows_master = response.HasField('master_bns')
    if knows_master:
        self.master = self.downstream_job.get_task_by_name(
            response.master_bns)
        return self.master

    # Without a master_bns field there is no master to remember.
    self.master = None
    logger.warning('%s doesn\'t know who the master is.' % self.server_id)
    Counter.get('server.discovery_failure').inc()
    return self.master
def process_capacity_response(self, response):
    """Stores the capacity granted in a response into the server state.

    For every entry in response.resource the granted lease is copied onto
    the matching resource and an action is scheduled at the lease's
    expiry time to check whether that resource's lease expired. When less
    capacity is granted than sum_leases() reports for the resource, the
    shortfall is logged and recorded in a counter and a gauge.

    Args:
      response: capacity response whose `resource` entries each carry a
        resource_id and a `gets` lease.
    """
    for resp in response.resource:
        assert resp.gets.capacity >= 0
        resource = self.find_resource(resp.resource_id)
        n = sum_leases(resource)

        # Getting less than sum_leases() reports is recorded as a
        # shortfall.
        if resp.gets.capacity < n:
            logger.warning(
                '%s shortfall for %s: getting %lf, but has %lf outstanding leases'
                % (self.get_server_id(), resource.resource_id,
                   resp.gets.capacity, n))
            Counter.get('server_capacity_shortfall').inc()
            Gauge.get('server.%s.shortfall' % self.get_server_id()).set(
                resp.gets.capacity - n)

        resource.has.CopyFrom(resp.gets)

        # Schedules an action at the expiry time to clear out the lease.
        # The id is bound as a default argument: a plain closure over
        # `resource` would be evaluated when the action fires, by which
        # time the loop variable refers to the last resource processed.
        scheduler.add_absolute(
            resource.has.expiry_time,
            lambda resource_id=resource.resource_id:
                self._maybe_lease_expired(resource_id))
def Discovery_RPC(self, request):
    """Serves a discovery request.

    Args:
      request: the discovery request; client_id identifies the caller in
        the log and resource_id lists the resources it asks about.

    Returns:
      A DiscoveryResponse with master_bns set when self.job knows a
      master, plus a safe_capacity entry for each requested resource
      whose template defines one.
    """
    assert request.IsInitialized()

    rpc_timer = Gauge.get('server.DiscoveryRPC.latency')
    rpc_timer.start_timer()
    caller = (self.server_id, request.client_id)
    logger.info('%s handling Discovery RPC from %s' % caller)

    response = DiscoveryResponse()

    # master_bns is only filled in when there is a current master.
    master = self.job.get_master()
    if not master:
        Counter.get('server.incomplete_discovery_response').inc()
    else:
        response.master_bns = master.get_server_id()

    # Reports the safe capacity of every requested resource that has one
    # configured.
    for requested_id in request.resource_id:
        template = global_config.find_resource_template(requested_id)
        has_safe_capacity = template and template.HasField('safe_capacity')
        if has_safe_capacity:
            safe = response.safe_capacity.add()
            safe.resource_id = requested_id
            safe.safe_capacity = template.safe_capacity

    assert response.IsInitialized()
    rpc_timer.stop_timer()
    return response
def GetCapacity_RPC(self, request):
    """Handles a GetCapacity request from a client.

    Works in two passes over request.resource: the first records what the
    client reports (priority, wants, has) in the server state, the second
    hands out capacity and builds the response.

    Args:
      request: the capacity request; client_id identifies the requester
        and each entry of `resource` is a request for one resource.

    Returns:
      None when this server is not the master. Otherwise a
      GetCapacityResponse with one entry per requested resource, except
      for resources that were requested again within _kMinimumInterval,
      which are left out.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # If this server is not the master it cannot handle this request.
    # The client should do a new Discovery.
    if not self.is_master():
        self.state.assert_clean()
        logger.info('%s getting a GetCapacity request when not master' % self.server_id)
        Counter.get('server.GetCapacity_RPC.not_master').inc()

        return None

    # The latency timer is only started for requests we actually serve.
    timer = Gauge.get('server.GetCapacity_RPC.latency')
    timer.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleanup the state. This removes resources and clients with expired
    # leases and such.
    self.state.cleanup()

    # A set of resources that we need to skip in step 2 (the actual
    # handing out of capacity).
    resources_to_skip = set()

    # First step: Go through the request and update the state with the
    # information from the request.
    for req in request.resource:
        # Finds the resource and the client state for this resource.
        (resource, cr) = self.state.find_client_resource(request.client_id, req.resource_id)

        # If this resource does not exist we don't need to do anything
        # right now.
        if resource:
            assert cr

            # Checks whether the last request from this client was at least
            # _kMinimumInterval seconds ago. If not, the state is left
            # untouched and the resource is skipped in step 2.
            if cr.HasField('last_request_time') and now - cr.last_request_time < _kMinimumInterval:
                logger.warning(
                    '%s GetCapacity request for resource %s within the %d second '
                    'threshold' %
                    (self.server_id, req.resource_id, _kMinimumInterval))
                resources_to_skip.add(req.resource_id)
            else:
                # Updates the state with the information in the request.
                cr.last_request_time = now
                cr.priority = req.priority
                cr.wants = req.wants

                # Mirrors what the client says it has, including having
                # nothing.
                if req.HasField('has'):
                    cr.has.CopyFrom(req.has)
                else:
                    cr.ClearField('has')

    # Creates a new response object in which we will insert the responses for
    # the resources contained in the request.
    response = GetCapacityResponse()

    # Step 2: Loop through all the individual resource requests in the request
    # and hand out capacity.
    for req in request.resource:
        # If this is a resource we need to skip, let's skip it.
        if req.resource_id in resources_to_skip:
            continue

        # Finds the resource and the client state for this resource.
        (resource, cr) = (self.state.find_client_resource(request.client_id, req.resource_id))

        # Adds a response proto to the overall response.
        resp = response.response.add()
        resp.resource_id = req.resource_id

        # If this is an unknown resource just give the client whatever it
        # is asking for.
        if not resource:
            assert not cr
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s' %
                (self.server_id, req.resource_id))
            resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
            resp.gets.capacity = req.wants
        else:
            # Sets the safe capacity in the response if there is one
            # configured for this resource.
            if resource.template.HasField('safe_capacity'):
                resp.safe_capacity = resource.template.safe_capacity

            # Finds the algorithm implementation object for this resource.
            algo = AlgorithmImpl.create(resource.template, self.server_level)

            # If the resource is in learning mode we just return whatever the client
            # has now and create a default lease.
            if resource.learning_mode_expiry_time >= now:
                if cr.HasField('has'):
                    has_now = cr.has.capacity
                else:
                    has_now = 0

                cr.has.CopyFrom(algo.create_lease(resource, has_now))
                Counter.get('server.learning_mode_response').inc()
            else:
                # Otherwise we just run the algorithm. This will update the
                # client state object.
                algo.run_client(resource, cr)
                Counter.get('server.algorithm_runs').inc()

            # Copies the output from the algorithm run into the response.
            resp.gets.CopyFrom(cr.has)

        assert resp.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
            (self.server_id, request.client_id, req.resource_id, req.wants,
             resp.gets.capacity, resp.gets.expiry_time - now,
             resp.gets.refresh_interval))

    assert response.IsInitialized()
    timer.stop_timer()

    return response
def final_report(self):
    """Writes the collected report as comma separated text.

    The report goes to the file named by self.filename when that is set
    and to sys.stdout otherwise. It consists of two header lines, one row
    per timestamp in self.data (clients, server jobs, then summaries),
    a table of all counters and a table of all gauges.

    Clients, server jobs and summaries are written in sorted order in
    both header lines and in every data row so that the columns line up.
    """
    if self.filename:
        fout = open(self.filename, 'w')
    else:
        fout = sys.stdout

    # Sorted once and reused everywhere, so headers and rows always agree
    # on the column order.
    clients = sorted(self.all_clients)
    server_jobs = sorted(self.all_server_jobs)
    summaries = sorted(self.all_summaries)

    try:
        # Prints the first header line.
        print >>fout, ',',

        for client in clients:
            print >>fout, '"%s"' % client, ',,',

        print >>fout, ',',

        for server in server_jobs:
            print >>fout, '"%s"' % server, ',,,,',

        print >>fout, ',',

        for s in summaries:
            print >>fout, '"%s"' % s,

            # The 'clients' summary has two columns, all others four.
            if s == 'clients':
                print >>fout, ',,',
            else:
                print >>fout, ',,,,',

        print >>fout

        # Prints the second header line.
        print >>fout, '"Time",',

        for client in clients:
            print >>fout, '"wants", "has",',

        print >>fout, ',',

        for server in server_jobs:
            print >>fout, '"wants", "has", "leases", "outstanding",',

        print >>fout, ',',

        # Iterates in the same sorted order as the first header line and
        # the data rows; the column groups differ in width, so a
        # different order would put the sub-headers over the wrong data.
        for s in summaries:
            if s == 'clients':
                print >>fout, '"total_wants", "total_has",',
            else:
                print >>fout, ('"total_wants", "total_has", "total_leases", '
                               '"total_outstanding",'),

        print >>fout

        # Goes through the data set in timestamp order.
        for timestamp in sorted(self.data.keys()):
            print >>fout, timestamp, ',',
            data = self.data[timestamp]

            # Prints the reporting data for every client and server that
            # we ever saw. If we have no data for a timestamp we print
            # nothing.
            for client in clients:
                if client in data:
                    d = data[client]
                    print >>fout, d.wants, ',', d.has, ',',
                else:
                    print >>fout, ',,',

            print >>fout, ',',

            # Do the same for the servers.
            for server in server_jobs:
                if server in data:
                    d = data[server]
                    print >>fout, d.wants, ',', d.has, ',', d.leases, ',', d.outstanding, ',',
                else:
                    print >>fout, ',,,,',

            # Now for the summaries.
            print >>fout, ',',
            data = self.summaries[timestamp]

            for s in summaries:
                if s not in data:
                    if s == 'clients':
                        print >>fout, ',,',
                    else:
                        print >>fout, ',,,,',

                    continue

                d = data[s]

                if s == 'clients':
                    print >>fout, d.total_wants, ',', d.total_has, ',',
                else:
                    print >>fout, d.total_wants, ',', d.total_has, ',', d.total_leases, ',', d.total_outstanding, ',',

            print >>fout

        # Now we go and print the counters, sorted by name.
        print >>fout
        print >>fout, '"Name", "Value"'

        names = sorted(counter.get_name() for counter in Counter.all_counters())

        for name in names:
            counter = Counter.get(name)
            print >>fout, counter.get_name(), ',', counter.get_value()

        # And all the gauges, sorted by name.
        print >>fout
        print >>fout, '"Name", "N", "Min", "Average", "Max"'

        names = sorted(gauge.get_name() for gauge in Gauge.all_gauges())

        for name in names:
            gauge = Gauge.get(name)
            print >>fout, gauge.get_name(), ',', gauge.get_count(), ',', gauge.get_min_value(), ',', gauge.get_average(), ',', gauge.get_max_value()
    finally:
        # Closes the output file even when writing the report failed;
        # sys.stdout is never closed.
        if self.filename:
            fout.close()

    if self.filename:
        logger.info('Report written to %s' % self.filename)
def GetCapacity_RPC(self, request):
    """Serves a client's request for capacity.

    Step 1 walks request.resource and records the client's reported
    priority, wants and has in the server state. Step 2 walks it again,
    determines what the client gets for each resource and adds that to
    the response.

    Args:
      request: the capacity request, carrying the client_id of the
        requester and one `resource` entry per requested resource.

    Returns:
      None when this server is not the master; otherwise a
      GetCapacityResponse. Resources requested again within
      _kMinimumInterval get no entry in the response.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # If this server is not the master it cannot handle this request.
    # The client should do a new Discovery.
    if not self.is_master():
        self.state.assert_clean()
        logger.info('%s getting a GetCapacity request when not master' %
                    self.server_id)
        Counter.get('server.GetCapacity_RPC.not_master').inc()

        return None

    # Latency is only measured for requests handled as master.
    timer = Gauge.get('server.GetCapacity_RPC.latency')
    timer.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleanup the state. This removes resources and clients with expired
    # leases and such.
    self.state.cleanup()

    # A set of resources that we need to skip in step 2 (the actual
    # handing out of capacity).
    resources_to_skip = set()

    # First step: Go through the request and update the state with the
    # information from the request.
    for req in request.resource:
        # Finds the resource and the client state for this resource.
        (resource, cr) = self.state.find_client_resource(
            request.client_id, req.resource_id)

        # If this resource does not exist we don't need to do anything
        # right now.
        if resource:
            assert cr

            # Checks whether the last request from this client was at least
            # _kMinimumInterval seconds ago. A request that comes too soon
            # does not update the state and is skipped in step 2.
            if cr.HasField('last_request_time') and now - cr.last_request_time < _kMinimumInterval:
                logger.warning(
                    '%s GetCapacity request for resource %s within the %d second '
                    'threshold' %
                    (self.server_id, req.resource_id, _kMinimumInterval))
                resources_to_skip.add(req.resource_id)
            else:
                # Updates the state with the information in the request.
                cr.last_request_time = now
                cr.priority = req.priority
                cr.wants = req.wants

                # Keeps the recorded 'has' in sync with the request, also
                # when the client reports no capacity at all.
                if req.HasField('has'):
                    cr.has.CopyFrom(req.has)
                else:
                    cr.ClearField('has')

    # Creates a new response object in which we will insert the responses for
    # the resources contained in the request.
    response = GetCapacityResponse()

    # Step 2: Loop through all the individual resource requests in the request
    # and hand out capacity.
    for req in request.resource:
        # If this is a resource we need to skip, let's skip it.
        if req.resource_id in resources_to_skip:
            continue

        # Finds the resource and the client state for this resource.
        (resource, cr) = (
            self.state.find_client_resource(
                request.client_id, req.resource_id))

        # Adds a response proto to the overall response.
        resp = response.response.add()
        resp.resource_id = req.resource_id

        # If this is an unknown resource just give the client whatever it
        # is asking for.
        if not resource:
            assert not cr
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s' %
                (self.server_id, req.resource_id))
            resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
            resp.gets.capacity = req.wants
        else:
            # Sets the safe capacity in the response if there is one
            # configured for this resource.
            if resource.template.HasField('safe_capacity'):
                resp.safe_capacity = resource.template.safe_capacity

            # Finds the algorithm implementation object for this resource.
            algo = AlgorithmImpl.create(resource.template, self.server_level)

            # If the resource is in learning mode we just return whatever the client
            # has now and create a default lease.
            if resource.learning_mode_expiry_time >= now:
                if cr.HasField('has'):
                    has_now = cr.has.capacity
                else:
                    has_now = 0

                cr.has.CopyFrom(algo.create_lease(resource, has_now))
                Counter.get('server.learning_mode_response').inc()
            else:
                # Otherwise we just run the algorithm. This will update the
                # client state object.
                algo.run_client(resource, cr)
                Counter.get('server.algorithm_runs').inc()

            # Copies the output from the algorithm run into the response.
            resp.gets.CopyFrom(cr.has)

        assert resp.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
            (self.server_id, request.client_id, req.resource_id, req.wants,
             resp.gets.capacity, resp.gets.expiry_time - now,
             resp.gets.refresh_interval))

    assert response.IsInitialized()
    timer.stop_timer()

    return response