def cancel(self, request_id):
    """
    Immediately mark a request as cancelled, and in the background try and
    cancel any outstanding JID for it.
    """
    # NOTE(review): raises KeyError if request_id is unknown — no guard here.
    request = self._by_request_id[request_id]

    # Idempotent behaviour: no-op if already cancelled
    # NOTE(review): this guard actually fires for *any* COMPLETE request,
    # not only previously-cancelled ones.
    if request.state == request.COMPLETE:
        return

    with self._update_index(request):
        # I will take over cancelling the JID from the request
        # (capture it before clearing, since complete() below finalizes
        # the request and we still need the JID to kill the salt job)
        cancel_jid = request.jid
        request.jid = None

        # Request is now done, no further calls
        request.set_error("Cancelled")
        request.complete()

        # In the background, try to cancel the request's JID on a best-effort basis
        # (the publish result of kill_job is deliberately not checked)
        if cancel_jid:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            client.run_job(request.minion_id, 'saltutil.kill_job',
                           [cancel_jid])
def fetch(self, minion_id, sync_type):
    """
    Start a background fetch of one cluster map from a minion.

    Publishes a ``ceph.get_cluster_object`` job to *minion_id* for
    *sync_type* and records the start time in ``self._fetching_at``.
    A failed publish is logged but never raised.

    :param minion_id: target minion id, or None (no-op with a warning)
    :param sync_type: sync-type object exposing a ``str`` attribute
    """
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        # (log.warn is a deprecated alias; use log.warning)
        log.warning("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    # TODO clean up unused 'since' argument
    pub_data = client.run_job(
        minion_id, 'ceph.get_cluster_object',
        condition_kwarg(
            [], {
                'cluster_name': self._cluster_name,
                'sync_type': sync_type.str,
                'since': None
            }))
    if not pub_data:
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        # Don't throw an exception because if a fetch fails we should always
        # carry on (NOTE(review): original comment was truncated here —
        # presumably a later retry/timeout path recovers)
    else:
        log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                  (pub_data['jid'], pub_data['minions']))
def get_running(self, minions):
    """Publish a saltutil.running ping to *minions* (best effort, no return)."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    published = salt_client.run_job(
        minions, 'saltutil.running', [], expr_form="list")
    if published:
        return
    log.warning(
        "Failed to publish saltutil.running to {0}".format(minions))
def run_job(self, fqdn, cmd, args):
    """Publish *cmd* with *args* to *fqdn* and return the salt JID.

    Raises Unavailable when the publish fails.
    """
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = salt_client.run_job(fqdn, cmd, condition_kwarg([], args))
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise Unavailable()
    return pub_data['jid']
def run_job_sync(self, fqdn, cmd, args, timeout=None):
    """Run *cmd* synchronously on *fqdn* and return its result.

    For a list of targets the whole result map is returned; for a single
    fqdn only that minion's entry is returned.  Raises Unavailable when
    no results come back.
    """
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    results = salt_client.cmd(fqdn, cmd, args, timeout=timeout)
    if not results:
        raise Unavailable()
    return results if isinstance(fqdn, list) else results[fqdn]
def _submit(self):
    """Publish this request's salt job, record its JID and return it.

    Raises PublishError when the publish fails.
    """
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = salt_client.run_job(self._minion_id, self._cmd, self._args)
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    jid = pub_data['jid']
    self.log.info("Request %s started job %s" % (self.id, jid))
    self.alive_at = now()
    self.jid = jid
    return jid
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really
    are still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" %
                  len(self._by_jid))

    # Identify JIDs who haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(
                seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                      (request.id, request.jid, _now, request.alive_at))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    # (values() instead of items(): the jid key was unused)
    query_minions = {request.minion_id for request in self._by_jid.values()}

    # Attempt to emit a saltutil.running to ping jobs, next tick we
    # will see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".
                 format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running',
                                  [], expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(
                query_minions))
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really
    are still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" %
                  len(self._by_jid))

    # Identify JIDs who haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    # (values() instead of items(): the jid key was unused)
    query_minions = {request.minion_id for request in self._by_jid.values()}

    # Attempt to emit a saltutil.running to ping jobs, next tick we
    # will see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
def fetch(self, minion_id, sync_type):
    """
    Start a background fetch of one cluster map from a minion.

    Publishes a ``ceph.get_cluster_object`` job to *minion_id* for
    *sync_type* and records the start time in ``self._fetching_at``.
    A failed publish is logged but never raised.

    :param minion_id: target minion id, or None (no-op with a warning)
    :param sync_type: sync-type object exposing a ``str`` attribute
    """
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        # (log.warn is a deprecated alias; use log.warning)
        log.warning("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    # TODO clean up unused 'since' argument
    pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                              condition_kwarg([], {
                                  'cluster_name': self._cluster_name,
                                  'sync_type': sync_type.str,
                                  'since': None
                              }))
    if not pub_data:
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        # Don't throw an exception because if a fetch fails we should always
        # carry on (NOTE(review): original comment was truncated here —
        # presumably a later retry/timeout path recovers)
    else:
        log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                  (pub_data['jid'], pub_data['minions']))
def _submit(self, commands=None):
    """Publish a ceph.rados_commands job for *commands* and record its JID.

    Falls back to ``self._commands`` when *commands* is None.  Raises
    PublishError when the publish fails; returns the JID otherwise.
    """
    commands = self._commands if commands is None else commands
    self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                             self._minion_id,
                                             self._cluster_name,
                                             commands))

    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = salt_client.run_job(
        self._minion_id, 'ceph.rados_commands',
        [self.fsid, self._cluster_name, commands])
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    jid = pub_data['jid']
    self.log.info("Request %s started job %s" % (self.id, jid))
    self.alive_at = now()
    self.jid = jid
    return jid
def cancel(self, fqdn, jid):
    """Ask *fqdn* to kill salt job *jid* (fire-and-forget, result unchecked)."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    salt_client.run_job(fqdn, 'saltutil.kill_job', [jid])
def get_running(self, minions):
    """Publish a saltutil.running ping to *minions* (best effort, no return)."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    published = salt_client.run_job(
        minions, 'saltutil.running', [], expr_form="list")
    if not published:
        log.warning("Failed to publish saltutil.running to {0}".format(minions))
def get_server_log(self, fqdn, log_path, lines):
    """Tail the last *lines* lines of *log_path* on *fqdn* via log_tail.tail."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    return salt_client.cmd(fqdn, "log_tail.tail", [log_path, lines])
def list_server_logs(self, fqdn):
    """List available log files on *fqdn* via the log_tail.list_logs module."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    listing = salt_client.cmd(fqdn, "log_tail.list_logs", ["."])
    log.debug('list_server_log result !!! {results}'.format(results=str(listing)))
    return listing
def list_server_logs(self, fqdn):
    """List available log files on *fqdn* via the log_tail.list_logs module."""
    salt_client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    listing = salt_client.cmd(fqdn, "log_tail.list_logs", ["."])
    log.debug('list_server_log result !!! {results}'.format(
        results=str(listing)))
    return listing
def _run_by_salt(tgt, fun, cmd):
    """Execute salt function *fun* with *cmd* on target *tgt* (5s timeout)."""
    salt_client = LocalClient(SALT_CONFIG_PATH)
    return salt_client.cmd(tgt, fun, cmd, timeout=5)