@inlineCallbacks
def mark_software_available(self, software, version):
    url = "{master_api}/agents/{agent}/software/".format(
        master_api=config.get("master_api"),
        agent=config.get("agent_id"))

    while True:
        try:
            response = yield post_direct(
                url, data={"software": software, "version": version})
        except Exception as error:
            delay = http_retry_delay()
            logger.error(
                "Failed to post availability of software %s, "
                "version %s to master: %r. Will retry in %s "
                "seconds.", software, version, error, delay)
            deferred = Deferred()
            reactor.callLater(delay, deferred.callback, None)
            yield deferred
        else:
            data = yield treq.content(response)
            if response.code == OK:
                logger.info(
                    "Posted availability of software %s, "
                    "version %s to master.", software, version)
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                delay = http_retry_delay()
                logger.warning(
                    "Could not post availability of software %s, "
                    "version %s. The master responded with "
                    "INTERNAL_SERVER_ERROR. Retrying in %s "
                    "seconds.", software, version, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                logger.error(
                    "Failed to post availability of software %s, "
                    "version %s: unexpected status from server %s. "
                    "Data: %s",
                    software, version, response.code, data)
                break

    if self.testing:
        self.operation_deferred.callback(None)
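
# NOTE (illustrative sketch): the retry loops in this module rely on
# http_retry_delay() to decide how long to wait before trying again.
# Its real implementation is not shown here; the helper below is only
# an assumption of what such a function might look like, and the
# "http_retry_delay_offset"/"http_retry_delay_factor" config keys are
# hypothetical names, not the project's actual settings.
import random

def http_retry_delay_sketch():
    """Return a randomized number of seconds to wait before a retry."""
    offset = config.get("http_retry_delay_offset", 1)
    factor = config.get("http_retry_delay_factor", 5)
    # Randomizing the delay prevents many agents from retrying a
    # struggling master at exactly the same moment.
    return offset + random.random() * factor
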
@inlineCallbacks
def post_agent_to_master(self):
    """
    Runs the POST request to contact the master. Running this method
    multiple times should be considered safe but is generally something
    that should be avoided.
    """
    url = self.agents_endpoint()
    data = self.system_data()

    try:
        response = yield post_direct(url, data=data)
    except Exception as failure:
        delay = http_retry_delay()
        if isinstance(failure, ConnectionRefusedError):
            svclog.error(
                "Failed to POST agent to master, the connection was "
                "refused. Retrying in %s seconds", delay)
        else:  # pragma: no cover
            svclog.error(
                "Unhandled error when trying to POST the agent to the "
                "master. The error was %s.", failure)

        if not self.shutting_down:
            svclog.info(
                "Retrying failed POST to master in %s seconds.", delay)
            yield deferLater(reactor, delay, self.post_agent_to_master)
        else:
            svclog.warning("Not retrying POST to master, shutting down.")
    else:
        # The master might be down or have some other internal problem
        # that might eventually be fixed.  Retry the request.
        if response.code >= INTERNAL_SERVER_ERROR:
            if not self.shutting_down:
                delay = http_retry_delay()
                svclog.warning(
                    "Failed to post to master due to a server side "
                    "error %s, retrying in %s seconds",
                    response.code, delay)
                yield deferLater(
                    reactor, delay, self.post_agent_to_master)
            else:
                svclog.warning(
                    "Failed to post to master due to a server side "
                    "error %s. Not retrying, because the agent is "
                    "shutting down.", response.code)

        # The master is up but is rejecting our request because there's
        # something wrong with it.  Do not retry the request.
        elif response.code >= BAD_REQUEST:
            text = yield response.text()
            svclog.error(
                "%s accepted our POST request but responded with "
                "code %s, which is a client side error. The message "
                "the server responded with was %r. This request "
                "cannot be retried because the problem is with the "
                "agent's own request.",
                url, response.code, text)
        else:
            data = yield treq.json_content(response)
            config["agent_id"] = data["id"]
            config.master_contacted()

            if response.code == OK:
                svclog.info(
                    "POST to %s was successful. Agent %s was updated.",
                    url, config["agent_id"])
            elif response.code == CREATED:
                svclog.info(
                    "POST to %s was successful. A new agent with an "
                    "id of %s was created.", url, config["agent_id"])

            returnValue(data)
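
# NOTE (illustrative sketch): post_agent_to_master() retries by
# re-invoking itself through deferLater(), while the other methods in
# this module pause in-line with a bare Deferred fired from
# reactor.callLater().  Both are standard Twisted idioms for sleeping
# without blocking the reactor; the self-contained demo below shows
# the two styles side by side.
from twisted.internet import reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.internet.task import deferLater

@inlineCallbacks
def _nonblocking_sleep_demo():
    # Style 1: deferLater() runs a callable after the delay and its
    # Deferred fires with the callable's return value.
    yield deferLater(reactor, 1.0, lambda: None)

    # Style 2: fire a bare Deferred from callLater() and yield it;
    # only this generator pauses, the reactor keeps running.
    pause = Deferred()
    reactor.callLater(1.0, pause.callback, None)
    yield pause
    reactor.stop()

if __name__ == "__main__":
    _nonblocking_sleep_demo()
    reactor.run()
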
@inlineCallbacks
def post_shutdown_to_master(self):
    """
    This method is called before the reactor shuts down and lets the
    master know that the agent's state is now ``offline``.
    """
    # We're operating under the assumption that something is wrong with
    # our code if we try to call this method before self.shutting_down
    # is set.
    assert self.shutting_down
    yield self.post_shutdown_lock.acquire()
    svclog.info("Informing master of shutdown")

    # Because post_shutdown_to_master is blocking and needs to stop the
    # reactor from finishing, we perform the retries in-line.
    data = None
    tries = 0
    num_retry_errors = 0
    response = None
    timed_out = False
    while True:
        tries += 1
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": AgentState.OFFLINE,
                    "free_ram": memory.free_ram(),
                    "current_assignments":
                        config["current_assignments"]})
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to post shutdown to the master, caught "
                    "try-again errors %s times in a row.",
                    num_retry_errors)
                break
            elif self.shutdown_timeout < datetime.utcnow():
                svclog.error(
                    "While posting shutdown to master, caught %s. "
                    "Shutdown timeout has been reached, not retrying.",
                    error.__class__.__name__)
                break
            else:
                svclog.debug(
                    "While posting shutdown to master, caught %s. "
                    "Retrying immediately.", error.__class__.__name__)

        # When we get a hard failure it could be an issue with the
        # server, although that's unlikely, so we retry.  Only retry
        # for a set period of time though, since the shutdown has a
        # timeout.
        except Exception as failure:
            if self.shutdown_timeout > datetime.utcnow():
                delay = http_retry_delay()
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Retrying in %s seconds.", failure, delay)

                # Wait for 'pause' to fire, introducing a delay.
                pause = Deferred()
                reactor.callLater(delay, pause.callback, None)
                yield pause
            else:
                timed_out = True
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Shutdown timeout reached, not retrying.", failure)
                break
        else:
            data = yield treq.json_content(response)
            if response.code == NOT_FOUND:
                svclog.warning(
                    "Agent %r no longer exists, cannot update state.",
                    config["agent_id"])
                break
            elif response.code == OK:
                svclog.info(
                    "Agent %r has POSTed shutdown state change "
                    "successfully.", config["agent_id"])
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Retrying in %s seconds.", data, delay)

                    # Wait for 'pause' to fire, introducing a delay.
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause
                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Shutdown timeout reached, not retrying.", data)
                    break
            else:
                # Any other status (e.g. a 4xx client error) would
                # otherwise retry in a tight loop, so stop here.
                svclog.error(
                    "Unexpected response while posting shutdown to "
                    "master: %s (code: %s). Not retrying.",
                    data, response.code)
                break

    yield self.post_shutdown_lock.release()

    extra_data = {
        "response": response,
        "timed_out": timed_out,
        "tries": tries,
        "retry_errors": num_retry_errors
    }
    if isinstance(data, dict):
        data.update(extra_data)
    else:
        data = extra_data
    returnValue(data)
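
# NOTE (illustrative sketch): post_shutdown_to_master() can only hold
# the process open if the reactor waits on it.  Twisted supports this
# via a "before shutdown" system event trigger: when the registered
# callable returns a Deferred, shutdown is delayed until it fires.
# The wiring below is an assumption about how the agent hooks this up;
# the "agent_shutdown_timeout" config key is a hypothetical name.
from datetime import datetime, timedelta
from twisted.internet import reactor

def register_shutdown_hook(agent):
    def _before_shutdown():
        agent.shutting_down = True
        # Mirror of the self.shutdown_timeout deadline used by the
        # retry loops above.
        agent.shutdown_timeout = datetime.utcnow() + timedelta(
            seconds=config.get("agent_shutdown_timeout", 15))
        return agent.post_shutdown_to_master()

    reactor.addSystemEventTrigger("before", "shutdown", _before_shutdown)
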
@inlineCallbacks
def reannounce(self, force=False):
    """
    Method which is used to periodically contact the master. This
    method is generally called as part of a scheduled task.
    """
    # Attempt to acquire the reannounce lock but fail after 70%
    # of the total time between reannouncements elapses.  This should
    # help prevent an accumulation of requests in the event the master
    # is having issues.
    try:
        yield self.reannounce_lock.acquire(
            config["agent_master_reannounce"] * .70
        )
    except utility.LockTimeoutError:
        svclog.debug("Timed out while waiting to acquire reannounce_lock")
        returnValue(None)

    if not self.should_reannounce() and not force:
        yield self.reannounce_lock.release()
        returnValue(None)

    svclog.debug("Announcing %s to master", config["agent_hostname"])
    data = None
    num_retry_errors = 0
    while True:  # for retries
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": config["state"],
                    "current_assignments": config.get(
                        "current_assignments", {}  # may not be set yet
                    ),
                    "free_ram": memory.free_ram(),
                    "disks": disks.disks(as_dict=True)
                }
            )
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to announce self to the master, caught "
                    "try-again type errors %s times in a row.",
                    num_retry_errors)
                break
            else:
                svclog.debug(
                    "While announcing self to master, caught %s. "
                    "Retrying immediately.", error.__class__.__name__)
        except Exception as error:
            if force:
                delay = http_retry_delay()
                svclog.error(
                    "Failed to announce self to the master: %s. Will "
                    "retry in %s seconds.", error, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Don't retry because reannounce is called periodically.
                svclog.error(
                    "Failed to announce self to the master: %s. This "
                    "request will not be retried.", error)
                break
        else:
            data = yield treq.json_content(response)
            if response.code == OK:
                config.master_contacted(announcement=True)
                svclog.info("Announced self to the master server.")
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Could not announce self to the master server, "
                        "internal server error: %s. Retrying in %s "
                        "seconds.", data, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    svclog.warning(
                        "Could not announce to master. Not retrying "
                        "because of pending shutdown.")
                    break
            elif response.code == NOT_FOUND:
                svclog.warning(
                    "The master says it does not know about our agent "
                    "id. Posting as a new agent.")
                yield self.post_agent_to_master()
                break

            # If this is a client problem retrying the request
            # is unlikely to fix the issue so we stop here.
            elif response.code >= BAD_REQUEST:
                svclog.error(
                    "Failed to announce self to the master, bad "
                    "request: %s. This request will not be retried.",
                    data)
                break
            else:
                svclog.error(
                    "Unhandled error when posting self to the "
                    "master: %s (code: %s). This request will not be "
                    "retried.", data, response.code)
                break

    yield self.reannounce_lock.release()
    returnValue(data)
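
# NOTE (illustrative sketch): reannounce() is "generally called as part
# of a scheduled task."  One plausible scheduler, shown here as an
# assumption rather than the project's actual startup code, is
# twisted.internet.task.LoopingCall driven by the same
# "agent_master_reannounce" interval the lock timeout is derived from.
from twisted.internet.task import LoopingCall

def start_reannounce_loop(agent):
    loop = LoopingCall(agent.reannounce)
    # now=False skips the immediate first call, since
    # post_agent_to_master() normally announces the agent at startup.
    loop.start(config["agent_master_reannounce"], now=False)
    return loop
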
def post(self, url, **kwargs):
    """Shortcut around ``post_direct`` which passes ``url`` through
    ``self.get_url()`` first."""
    return post_direct(self.get_url(url), **kwargs)
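
# Example usage of the post() wrapper above (the "/agents/" path and
# the exact join behavior of get_url() are assumptions for
# illustration):
#
#     response = yield self.post(
#         "/agents/", data={"hostname": config["agent_hostname"]})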