def cancel(self, request_id):
    """
    Flag a request as cancelled straight away, then make a best-effort
    attempt to kill whatever salt job is still outstanding for it.

    :param request_id: key of the request in ``self._by_request_id``
    :raises KeyError: if ``request_id`` is not a known request
    """
    target = self._by_request_id[request_id]

    # All state changes happen inside _update_index so that whatever
    # bookkeeping it does sees the request before and after the change.
    with self._update_index(target):
        target.set_error("Cancelled")
        target.complete()

        outstanding_jid = target.jid
        if not outstanding_jid:
            # Nothing was running remotely, so there is nothing to kill.
            return

        salt_config_path = config.get('cthulhu', 'salt_config_path')
        # We don't check for completion or errors from kill_job, it's a
        # best-effort thing.  If we're cancelling something we will do our
        # best to kill any subprocess but can't make any guarantees, because
        # running nodes may be out of touch with the calamari server.
        LocalClient(salt_config_path).run_job(
            target.minion_id, 'saltutil.kill_job', [outstanding_jid])
        target.jid = None
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really
    are still happening.

    Each call works in three phases:

    1. Find requests in ``self._by_jid`` whose ``alive_at`` is more than
       three ``TICK_PERIOD``s in the past.
    2. Fail each of those with "Lost contact", clear its JID and complete it.
    3. Publish ``saltutil.running`` to the minions of the requests that are
       still in ``self._by_jid``.  Responses are not handled here; the next
       tick relies on ``alive_at`` having been refreshed in the meantime.

    Does nothing when no JIDs are being tracked.
    """
    if not self._by_jid:
        return

    log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs who haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    # The staleness threshold is the same for every request, so build it once.
    stale_after = datetime.timedelta(seconds=TICK_PERIOD * 3)
    for request in self._by_jid.values():
        if _now - request.alive_at > stale_after:
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight.  A set is used so each
    # minion is only listed once in the published target list.
    query_minions = set(request.minion_id for request in self._by_jid.values())

    # Attempt to emit a saltutil.running to ping jobs, next tick we
    # will see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            # A failed publish is only logged: the staleness check on a later
            # tick is what eventually errors the affected requests out.
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
def _submit(self, commands):
    """
    Publish ``commands`` to this request's minion as a
    ``ceph.rados_commands`` salt job and remember the resulting JID.

    On success ``self.alive_at`` is set to the current time and ``self.jid``
    to the published job's ID.

    :param commands: passed through unchanged as the last job argument
    :return: the JID of the job that was started
    :raises PublishError: if salt returns no publication data
    """
    self.log.debug("Request._submit: %s/%s/%s" % (
        self._minion_id, self._cluster_name, commands))

    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    job_args = [self._fsid, self._cluster_name, commands]
    published = salt_client.run_job(self._minion_id, 'ceph.rados_commands', job_args)

    if not published:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    started_jid = published['jid']
    self.log.info("Request %s started job %s" % (self.id, started_jid))

    # Stamp the liveness time before exposing the JID, as the original did.
    self.alive_at = now()
    self.jid = started_jid
    return self.jid
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really
    are still happening.

    Each call works in three phases:

    1. Find requests in ``self._by_jid`` whose ``alive_at`` is more than
       three ``TICK_PERIOD``s in the past.
    2. Fail each of those with "Lost contact", clear its JID and complete it.
    3. Publish ``saltutil.running`` to the minions of the requests that are
       still in ``self._by_jid``.  Responses are not handled here; the next
       tick relies on ``alive_at`` having been refreshed in the meantime.

    Does nothing when no JIDs are being tracked.
    """
    if not self._by_jid:
        return

    log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs who haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    # The staleness threshold is the same for every request, so build it once.
    stale_after = datetime.timedelta(seconds=TICK_PERIOD * 3)
    for request in self._by_jid.values():
        if _now - request.alive_at > stale_after:
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight.  A set is used so each
    # minion is only listed once in the published target list.
    query_minions = set(request.minion_id for request in self._by_jid.values())

    # Attempt to emit a saltutil.running to ping jobs, next tick we
    # will see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            # A failed publish is only logged: the staleness check on a later
            # tick is what eventually errors the affected requests out.
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))