def set_job_attribute(self, scheduler, job_id, name, value, callback, submission):
    '''
    Asynchronously set one attribute on a job through the Aviary
    "setJobAttribute" operation.

    scheduler -- object supplying Machine, Pool and Name of the scheduler
    job_id -- value stored in the 'job' field of the Aviary JobID
    name -- name of the attribute to set
    value -- value of the attribute; it is always sent with type "STRING"
    callback -- required callable, invoked from the call thread with the
    arguments produced by self._cb_args_dataless()
    submission -- object supplying Name and Owner of the submission

    A client is borrowed from self.job_client_pool.  It is handed back by
    the completion callback, or right here if preparing the request fails
    (in which case the original exception still propagates to the caller).
    '''
    assert callback

    def my_callback(result):
        # The request is finished, so the client can go back to the pool
        self.job_client_pool.return_object(job_client)
        result = self._pretty_result(result, scheduler.Machine)

        # massage results for use by standard callback
        cb_args = self._cb_args_dataless(result)
        callback(*cb_args)

    job_client = self.job_client_pool.get_object()
    prepared = False
    try:
        self._setup_client(job_client,
                           self.job_servers,   # server lookup object
                           scheduler.Machine,  # host we want
                           "setJobAttribute")

        # Make a job id parameter (see job wsdl)
        jobId = job_client.factory.create('ns0:JobID')
        jobId.job = job_id
        jobId.pool = scheduler.Pool
        jobId.scheduler = scheduler.Name
        jobId.submission.name = submission.Name
        jobId.submission.owner = submission.Owner

        # Make attribute parameter from name and value
        aviary_attr = job_client.factory.create('ns0:Attribute')
        aviary_attr.name = name
        aviary_attr.type = "STRING"
        aviary_attr.value = value
        prepared = True
    finally:
        # Without this, a failure above would leave the client checked
        # out forever because my_callback would never run.
        if not prepared:
            self.job_client_pool.return_object(job_client)

    # Started outside the guarded section: from here on my_callback owns
    # the job of returning the client.
    t = CallThread(self.call_client_retry, my_callback,
                   job_client, "setJobAttribute", jobId, aviary_attr)
    t.start()
def _control_job(self, scheduler, job_id, reason, submission, meth_name, *args, **kwargs):
    '''
    Invoke a job control operation named by meth_name on the Aviary job
    service, passing it a JobID and a reason.

    scheduler -- object supplying Machine, Pool and Name of the scheduler
    job_id -- value stored in the 'job' field of the Aviary JobID
    reason -- passed through as the second argument of the remote call
    submission -- object supplying Name and Owner of the submission
    meth_name -- name of the service method to call
    args -- accepted but not used by this method
    kwargs -- only "callback" is consulted.  "default" and "timeout" may
    be supplied by callers but are ignored here.

    If a callback is given, the call is made on a CallThread and the
    callback receives the arguments produced by self._cb_args_dataless();
    nothing is returned.  Otherwise the call goes through self._call_sync()
    and its result is returned.

    The borrowed client is always handed back to self.job_client_pool,
    including when preparing or making the call raises.
    '''
    # A missing or false callback selects the synchronous path
    callback = kwargs.get("callback") or None

    client = self.job_client_pool.get_object()
    handed_off = False
    try:
        self._setup_client(client,
                           self.job_servers,   # server lookup object
                           scheduler.Machine,  # host we want
                           meth_name)

        # The looked-up method object is not used; the lookup itself is
        # kept because it was part of the original setup sequence.
        # NOTE(review): presumably this surfaces a bad meth_name early --
        # confirm before removing.
        getattr(client.service, meth_name)

        # Make a job id parameter (see job wsdl)
        jobId = client.factory.create('ns0:JobID')
        jobId.job = job_id
        jobId.pool = scheduler.Pool
        jobId.scheduler = scheduler.Name
        jobId.submission.name = submission.Name
        jobId.submission.owner = submission.Owner

        if callback:
            # The thread is started after the guarded section; from then
            # on my_callback is responsible for returning the client.
            handed_off = True
        else:
            def my_process_results(result):
                # Fix up the exception message if necessary
                result = self._pretty_result(result, scheduler.Machine)
                return self._cb_args_dataless(result)

            return self._call_sync(my_process_results,
                                   self.call_client_retry,
                                   client, meth_name, jobId, reason)
    finally:
        # Covers the normal synchronous return as well as every failure
        if not handed_off:
            self.job_client_pool.return_object(client)

    def my_callback(result):
        self.job_client_pool.return_object(client)

        # Fix up the exception message if necessary
        result = self._pretty_result(result, scheduler.Machine)
        cb_args = self._cb_args_dataless(result)
        callback(*cb_args)

    t = CallThread(self.call_client_retry, my_callback,
                   client, meth_name, jobId, reason)
    t.start()
def get_job_summaries(self, submission, callback, machine_name):
    '''
    Asynchronously fetch the job summaries of a submission using the
    Aviary "getSubmissionSummary" operation.

    submission -- object supplying Name and Owner of the submission
    callback -- required callable.  On failure it is called as
    callback(exception, None); otherwise as callback(status, data) where
    data is {"Jobs": <list of dicts>} when the status is "OK" and the
    response carries jobs, and {"Jobs": None} in every other case.
    machine_name -- host used for server lookup and for fixing up
    exception messages

    The borrowed query client has attributes enabled for the duration of
    the call.  They are switched off again and the client is returned to
    self.query_client_pool by the completion callback, or right here if
    preparing the request fails (the exception still propagates).
    '''
    assert callback

    def to_int_seconds(dt):
        # Change a datetime.datetime into int seconds since epoch
        # Note, this works nicely if the datetime happens to include microseconds
        # since the call to timetuple will drop them.  Stuff coming back from
        # condor should not have microseconds anyway.
        return int(time.mktime(dt.timetuple()))

    def get_string(job, attr):
        # Cast suds text types into str so we have standard Py types
        # Handles optional strings as well
        if hasattr(job, attr):
            return str(getattr(job, attr))
        return ""

    def adapt(jobs):
        # Make an aviary job summary look like the canonical form
        # that cumin is expecting (actually the QMF form because of history).
        result = list()
        for job in jobs:
            cluster, proc = job.id.job.split(".")
            j = dict()
            j["ClusterId"] = int(cluster)
            j["Cmd"] = str(job.cmd)
            j["EnteredCurrentStatus"] = to_int_seconds(job.last_update)

            # Note, GlobalJobId here will not match the same value from
            # QMF because the qdate portion of the name is missing
            j["GlobalJobId"] = job.id.scheduler + "#" + job.id.job
            j["JobStatus"] = str(job.job_status)
            j["ProcId"] = int(proc)
            j["QDate"] = to_int_seconds(job.queued)

            # These may be null...
            j["Args"] = get_string(job, "args1")
            j["ReleaseReason"] = get_string(job, "released")
            j["HoldReason"] = get_string(job, "held")
            result.append(j)
        return result

    def my_callback(result):
        # Turn attributes back off before the client is pooled again
        query_client.set_enable_attributes(False)
        self.query_client_pool.return_object(query_client)

        result = self._pretty_result(result, machine_name)
        if isinstance(result, Exception):
            callback(result, None)
        else:
            status = _AviaryCommon._get_status(result[0].status)
            if status == "OK" and hasattr(result[0], "jobs"):
                data = {"Jobs": adapt(result[0].jobs)}
            else:
                data = {"Jobs": None}
            callback(status, data)

    query_client = self.query_client_pool.get_object()
    attributes_on = False
    prepared = False
    try:
        self._setup_client(query_client,
                           self.query_servers,  # server lookup object
                           machine_name,        # host we want
                           "getSubmissionSummary")

        # What we really want here is the job summaries from the
        # submission summary response.  To get those, we have to
        # set an extra attribute on the client...
        query_client.set_enable_attributes(True)
        attributes_on = True
        query_client.set_attributes({"includeJobSummaries": "true"})

        # Make a submission id. (see query wsdl)
        subId = query_client.factory.create('ns0:SubmissionID')
        subId.name = submission.Name
        subId.owner = submission.Owner
        prepared = True
    finally:
        # my_callback never runs if we fail before starting the thread,
        # so undo what was done to the client and give it back here.
        if not prepared:
            if attributes_on:
                query_client.set_enable_attributes(False)
            self.query_client_pool.return_object(query_client)

    t = CallThread(self.call_client_retry, my_callback,
                   query_client, "getSubmissionSummary", subId)
    t.start()
def submit_job(self, scheduler, ad, callback):
    '''
    Asynchronously submit a job described by 'ad' using the Aviary
    "submitJob" operation.

    scheduler -- object supplying the Machine used for server lookup
    ad -- mapping of job attribute names to values.  The key
    "!!descriptors" names the attributes that are expressions; it is not
    sent to Aviary itself.
    callback -- required callable.  On failure it is called as
    callback(exception, None); otherwise as callback(status, job_id)
    where job_id is the id from the response when the status is "OK" and
    the response has one, and None otherwise.

    The borrowed job client has attributes enabled for the duration of
    the call.  They are switched off again and the client is returned to
    self.job_client_pool by the completion callback, or right here if
    preparing the request fails (the exception still propagates).
    '''
    assert callback

    def my_callback(result):
        # Turn this back off before we put it back in the pool
        # so allow_overrides isn't set for someone else...
        job_client.set_enable_attributes(False)
        self.job_client_pool.return_object(job_client)

        result = self._pretty_result(result, scheduler.Machine)
        if isinstance(result, Exception):
            callback(result, None)
        else:
            # the aviary response has the job id available,
            # we'll pass it anyway even though Cumin does not care
            # at the present time
            status = _AviaryCommon._get_status(result.status)
            if status == "OK" and hasattr(result, "id"):
                job_ref = result.id
            else:
                job_ref = None
            callback(status, job_ref)

    job_client = self.job_client_pool.get_object()
    attributes_on = False
    prepared = False
    try:
        self._setup_client(job_client,
                           self.job_servers,   # server lookup object
                           scheduler.Machine,  # host we want
                           "submitJob")

        # Set basic attributes in the order defined by aviary-job.wsdl.
        args = list()
        basic_attrs = ("Cmd", "Args", "Owner", "Iwd", "Submission")
        for attr in basic_attrs:
            try:
                args.append(ad[attr])
            except KeyError:
                # Someone may be unhappy if this is a required param!
                # Let the downstream code generate an error
                # NOTE(review): skipping a missing attribute shifts the
                # position of every later argument -- confirm that is
                # really what the downstream code should see.
                pass

        # Add empty list for Aviary's basic requirement value...
        args.append([])

        # and let's let Requirements remain an unrestricted expression so that
        # we can just pass through the value from Cumin without interfering.
        # To do that, we need to specify Requirements through the
        # "extras" fields and set allowOverrides to True.
        # (otherwise, Requirements will be limited to particular
        # resource constraint types defined by aviary)
        job_client.set_enable_attributes(True)
        attributes_on = True
        job_client.set_attributes({"allowOverrides": True})

        extras = list()
        for k, v in ad.iteritems():
            # We don't need to send descriptors down to aviary
            # and basic_attrs have already been filled in
            if k == "!!descriptors" or k in basic_attrs:
                continue
            extra = job_client.factory.create('ns0:Attribute')
            extra.name = k

            # But we do need to look in descriptors to find expressions...
            if k in ad["!!descriptors"]:
                extra.type = "EXPRESSION"
            else:
                try:
                    extra.type = self.type_to_aviary[type(v)]
                except KeyError:
                    extra.type = "UNDEFINED"
            extra.value = v
            extras.append(extra)

        # Important, extras itself must be added as an embedded list or
        # suds will consider only a single item
        args.append(extras)
        prepared = True
    finally:
        # my_callback never runs if we fail before starting the thread,
        # so undo what was done to the client and give it back here.
        if not prepared:
            if attributes_on:
                job_client.set_enable_attributes(False)
            self.job_client_pool.return_object(job_client)

    t = CallThread(self.call_client_retry, my_callback,
                   job_client, "submitJob", *args)
    t.start()
class WallabyOperations(object):
    '''
    Wrapper around the Wallaby client library.

    A background thread (see start()) connects to the broker, obtains the
    Wallaby Store object and keeps a per-category cache of the data it
    exposes.  The accessor methods only ever read that cache.
    '''
    def __init__(self, broker_uri, refresh_interval=None, sasl_mech_list=None):
        '''
        Constructor.

        broker_uri -- the URI used to connect to a QMF message broker
        where a Wallaby agent is connected.  The simplest URI is just a
        hostname but a full URI can specify scheme://user/password@host:port
        or a subset of those components as long as the host is included.
        Examples:
            localhost
            localhost:5672
            amqp://fred/password@somehost:1234

        refresh_interval -- default refresh interval in seconds for all
        items maintained by WallabyOperations' internal caching thread.
        A value of None causes the caching thread to wait forever before
        refreshing an item after a successful call unless the refresh()
        method is used.  The refresh interval may be set for items
        individually with the set_interval() method.

        sasl_mech_list -- restricts the list of sasl mechanisms
        that will be allowed when connecting to a QMF message broker.
        If the broker URL contains no credentials, default is ANONYMOUS.
        If the broker URL does contain credentials, default is
        'PLAIN DIGEST-MD5'
        '''
        self.broker_uri = broker_uri
        self.sasl_mech_list = get_sasl_mechanisms(broker_uri, sasl_mech_list)

        # A wallaby Store object
        self._store = None

        # A QMF broker
        # NOTE(review): the caching thread stores its session and broker
        # in self.session / self.broker, not here -- confirm whether this
        # attribute is still used anywhere.
        self._broker = None

        # The cache maintenance thread
        self._maintain_cache = None

        # Stop the maintenance thread
        self._stop = False

        # Cached data.  Each of the keys in this dictionary is the name of
        # an attribute on the Wallaby Store object, with the exception of
        # WBTypes.TAGS.  The TAGS data is a subset of the GROUPS produced
        # in this module.
        self._cache = {WBTypes.NODES: self.CacheData(refresh_interval),
                       WBTypes.GROUPS: self.CacheData(refresh_interval),
                       WBTypes.FEATURES: self.CacheData(refresh_interval),
                       WBTypes.TAGS: self.CacheData(refresh_interval,
                                                    synthetic=self._generate_tag_data)}

        # Cache a list of nodes that are members of a tag
        self._nodes_by_tag = dict()

        # Store the name of the partition group so we can filter it out
        # of tags/groups that we return
        self._partition_group = None

        # Lock is used for synchronization with the caching thread and
        # for thread safety of any and all data that could be accessed
        # by multiple threads.
        self._lock = Lock()
        self._condition = Condition(self._lock)

    def start(self, retry_secs=5):
        '''
        Start the caching thread.

        This thread will attempt to connect to the broker and retrieve
        a Store object from the Wallaby agent.  If successful, it will
        periodically retrieve and cache data from Wallaby.

        Only one caching thread may run at a time.  The thread may be
        restarted if it has previously been stopped.

        Note, for the moment start() and stop() are not thread safe.  They
        should only be called from a single thread.

        retry_secs -- how often the caching thread will retry failed
        operations.  This includes attempts to connect to the broker
        and retrieve a Store object as well as calls to Wallaby that
        return no data.

        Returns True if a thread was started.  Returns False if one is
        already running or if broker_uri is None.
        '''
        # Imported locally so the handlers below can report the active
        # exception without version-specific "except ... , e" syntax.
        import sys

        # The connection to the broker can actually take a long
        # time to complete.  We don't want to hang a calling function,
        # so we handle the connection and retrieval of the
        # initial Store object from Wallaby in a thread.
        # (There may need to be more work here if the broker or wallaby
        # going away and coming back causes a problem, but with
        # manageConnections=True and well-known agent/object ids for
        # Wallaby it appears to recover on its own...)

        # Similarly, getting node lists etc may take a long time
        # especially over a slow network.  So we use the same thread
        # to retrieve things like node lists at defined intervals.

        # 'self' here is really a term of art since this is a local
        # function, but it refers to the WallabyOperations object
        # so the code reads nicely
        def maintain_cache(self):
            # Get initial connection and Store object
            self.session = Session(manageConnections=True)
            self.broker = self.session.addBroker(self.broker_uri,
                                                 mechanisms=self.sasl_mech_list)
            try:
                while not self._stop:
                    self._store = self._get_store()
                    if self._store is not None:
                        setup(self._store)
                        self._partition_group = self._store.getPartitionGroup().name
                        log.debug("WallabyOperations: found wallaby store object")
                        break

                    # Check stop inside the lock to make sure that we don't miss
                    # a signal or a "stop" that was set while we were iterating.
                    self._condition.acquire()
                    try:
                        if not self._stop:
                            self._condition.wait(retry_secs)
                    finally:
                        self._condition.release()

                # Init remaining time til next update to 0 for each
                # cached item in case the thread was restarted
                for val in self._cache.values():
                    val.remaining = 0

                # Okay, now we're ready to retrieve data
                while not self._stop:
                    start_processing = time.time()
                    for attr, val in self._cache.items():
                        if self._stop:
                            break
                        # val.remaining is the number of seconds left before
                        # the next update of this data item.  None is "forever".
                        # Synthetic items are not retrieved from the store.
                        if not val.synthetic and \
                           val.remaining is not None and val.remaining <= 0:
                            d = get_values(attr, getattr, self._store, attr, [])
                            # If the data is empty, _set_cache will leave the
                            # remaining field set to 0 for the attribute so we
                            # will try to get it again on our next retry.
                            # Otherwise, remaining will be reset to the full
                            # interval for this attribute.
                            self._set_cache(attr, d)

                    # Now handle the synthetics.  val.synthetic generates
                    # and stores its own results.
                    for attr, val in self._cache.items():
                        if self._stop:
                            break
                        if val.synthetic and \
                           val.remaining is not None and val.remaining <= 0:
                            get_values(attr, val.synthetic, *val.args)

                    log.debug("WallabyOperations: total refresh processing time %s" \
                              % (time.time() - start_processing))

                    # Find out how long we should sleep for.
                    # Based on min remaining times for all items
                    # If minimum is 0 because we have items waiting
                    # for a retry, we fall back on retry_secs as a minimum.
                    sleep_time = self._find_min_remaining(min=retry_secs)

                    self._condition.acquire()
                    try:
                        if not self._stop:
                            # Could be signaled, so track the actual sleep time
                            log.debug("WallabyOperations: cache thread sleeping for"\
                                      " %s seconds" % sleep_time)
                            bed_time = time.time()
                            self._condition.wait(sleep_time)
                            slept = time.time() - bed_time
                            log.debug("WallabyOperations: cache thread slept for"\
                                      " %s seconds" % slept)

                            # When we wake up from sleep here, we already
                            # have the lock so we might as well check refresh
                            # and adjust the "remaining" values
                            for val in self._cache.values():
                                if val.refresh:
                                    # Force an update
                                    val.remaining = 0
                                    val.refresh = False
                                elif val.remaining is not None:
                                    val.remaining -= slept
                    finally:
                        self._condition.release()
            finally:
                # Runs when we have been stopped and also when the loop
                # above dies with an exception, so a dead thread never
                # leaves stale data or a dangling broker behind.
                try:
                    # Clear cache....
                    for attr in self._cache:
                        self._set_cache(attr, [])
                    self._store = None
                finally:
                    # Have to clean up the broker (best effort)
                    try:
                        self.session.delBroker(self.broker)
                    except Exception:
                        pass
        #end maintain_cache

        def get_values(attr, call, *args):
            # Run call(*args), timing it; any failure yields an empty list
            log.debug("WallabyOperations: refreshing %s" % attr)
            start = time.time()
            try:
                # Wallaby API uses extensions to __getattr__ on
                # the Store to retrieve objects from the Broker
                # and return a list of proxy objects.
                d = call(*args)
            except Exception:
                log.debug("WallabyOperations: refresh of %s failed, %s"
                          % (attr, str(sys.exc_info()[1])))
                d = []
            delta = time.time() - start
            log.debug("WallabyOperations: %s seconds to refresh %s" % (delta, attr))
            return d

        # Wrap the entire cache thread with an exception handler
        def wrap_maintain_cache():
            try:
                maintain_cache(self)
                log.debug("WallabyOperations: cache maintenance thread exited")
            except Exception:
                # Still swallowed so the thread ends quietly, but no
                # longer silently.
                log.debug("WallabyOperations: cache maintenance thread "
                          "died, %s" % str(sys.exc_info()[1]))

        if self._maintain_cache is not None and \
           self._maintain_cache.isAlive():
            # No, you can't start another one.
            return False

        self._stop = False
        if self.broker_uri is not None:
            self._maintain_cache = CallThread(wrap_maintain_cache, None)
            self._maintain_cache.daemon = True
            self._maintain_cache.start()
            log.debug("WallabyOperations: start cache maintenance thread")
            return True
        return False

    def stop(self, wait=False, timeout=None):
        '''
        Stop the caching thread.

        Wake the caching thread if asleep and cause it to exit.
        The thread may be restarted again with a call to start()
        once it has successfully exited.  On exit, the thread will
        null out cached data.

        wait -- if True the call will block until the thread exits or
        "timeout" seconds has passed if "timeout" is not None.

        timeout -- how long to wait for the thread to exit if "wait" is True.
        A value of None means wait forever.

        Note, for the moment start() and stop() are not thread safe.  They
        should only be called from a single thread.
        '''
        if self._maintain_cache is not None:
            self._condition.acquire()
            try:
                self._stop = True
                self._condition.notify()
            finally:
                self._condition.release()
            if wait and self._maintain_cache.isAlive():
                log.debug("WallabyOperations: waiting for cache maintenance thread to exit")
                self._maintain_cache.join(timeout)
            log.debug("WallabyOperations: stopped cache maintenance thread")

    def refresh(self, *items):
        '''
        Wake the caching thread if asleep and cause it to iterate.

        items -- what data to refresh.  If "items" is an empty tuple,
        refresh all data otherwise refresh only the data specified.
        Attributes of WBTypes define valid values for elements of "items".
        Unknown items are ignored; the thread is only woken if at least
        one item was marked for refresh.
        '''
        self._condition.acquire()
        try:
            if not items:
                do_notify = True
                for val in self._cache.values():
                    val.refresh = True
            else:
                do_notify = False
                for attr in items:
                    if attr in self._cache:
                        do_notify = True
                        self._cache[attr].refresh = True
            if do_notify:
                self._condition.notify()
        finally:
            self._condition.release()

    def get_data(self, which, valuefilter=None):
        '''
        Return a list of cached values for the specified category.
        The values returned will be proxy objects constructed by
        the Wallaby client library.

        which -- specifies the category.  Attributes of WBTypes define
        valid values for "which"

        valuefilter -- optional mapping, consulted only for WBTypes.NODES.
        Its "nodeName" entry has every "%" stripped and, unless it is
        "%%%" or strips down to nothing, only nodes whose name contains
        the remainder are returned.

        An unknown category yields an empty list.
        '''
        d = []
        self._lock.acquire()
        try:
            if which in self._cache:
                # Copy into a real list while we hold the lock
                d = list(self._cache[which].data.values())

                # Here we handle the possible filtering of node names
                if which == WBTypes.NODES:
                    if valuefilter is not None and valuefilter["nodeName"] != "%%%":
                        needle = valuefilter["nodeName"].replace("%", "")
                        if needle != "":
                            d = [value for value in d if needle in value.name]
        finally:
            self._lock.release()
        return d

    def get_names(self, which):
        '''
        Return a list of cached names for the specified category.
        The values returned will be the names of objects constructed by
        the Wallaby client library.

        which -- specifies the category.  Attributes of WBTypes define
        valid values for "which"

        An unknown category yields an empty list.
        '''
        d = []
        self._lock.acquire()
        try:
            if which in self._cache:
                # Copy into a real list while we hold the lock
                d = list(self._cache[which].data.keys())
        finally:
            self._lock.release()
        return d

    def get_node_by_name(self, name):
        '''
        Return a cached wallaby.Node object by name.
        If name does not designate a currently cached object, None is returned.
        '''
        return self._lookup_by_name(WBTypes.NODES, name)

    def get_group_by_name(self, name):
        '''
        Return a cached wallaby.Group object by name.
        If name does not designate a currently cached object, None is returned.
        '''
        return self._lookup_by_name(WBTypes.GROUPS, name)

    def get_tag_by_name(self, name):
        '''
        Return a cached wallaby.Tag object by name.
        If name does not designate a currently cached object, None is returned.
        '''
        return self._lookup_by_name(WBTypes.TAGS, name)

    def get_feature_by_name(self, name):
        '''
        Return a cached wallaby.Feature object by name.
        If name does not designate a currently cached object, None is returned.
        '''
        return self._lookup_by_name(WBTypes.FEATURES, name)

    def get_node_names(self, tag):
        '''
        Return a list of node names associated with the tag.

        tag -- either the tag name as a string or an object with a
        "name" attribute.

        The return result is a list containing the names of nodes in the
        tag group, or an empty list if the tag is not known.
        '''
        names = []
        if isinstance(tag, (str, unicode)):
            n = tag
        else:
            n = tag.name
        self._lock.acquire()
        try:
            if n in self._nodes_by_tag:
                names = self._nodes_by_tag[n]
        finally:
            self._lock.release()
        return names

    def get_tag_names(self, node):
        '''
        Return a list of tag names associated with the node.

        node -- either the node name as a string or an object with a
        "name" attribute.

        The return result is a list containing the names of tags on the
        specified node.  An empty list is returned if no name can be
        derived from "node" or the node is not cached.
        '''
        names = []
        n = None
        if isinstance(node, (str, unicode)):
            n = node
        elif hasattr(node, "name"):
            n = node.name

        if n is None:
            log.debug("WallabyOperations: get_tag_names(), parameter 'node' yields no name, returning []")
        else:
            self._lock.acquire()
            try:
                if n in self._cache[WBTypes.NODES].data:
                    names = self._cache[WBTypes.NODES].data[n].getTags()
            finally:
                self._lock.release()
        return names

    def create_tags(self, names):
        '''
        Create new tags in the Wallaby store.

        names -- iterable of tag names, each passed to the store's addTag()

        Returns True if every tag was added.  Returns False if the store
        is not available yet or if adding a tag raised; the exception is
        logged and suppressed, and tags added before it stay added.

        NOTE(review): nothing here refreshes the cached groups and tags;
        callers presumably follow up with refresh() -- confirm.
        '''
        import sys

        if self._store is None:
            log.debug("WallabyOperations: create_tag, store object not yet created")
            return False
        # Acquire before entering the try so that a failed acquire can
        # never lead to releasing a lock we do not hold.
        self._lock.acquire()
        try:
            try:
                for name in names:
                    self._store.addTag(name)
            except Exception:
                log.debug("WallabyOperations: create_tag, exception suppressed, %s" % str(sys.exc_info()[1]))
                return False
        finally:
            self._lock.release()
        return True

    def remove_tags(self, names):
        '''
        Remove a set of tags from the Wallaby store.

        Check the cached list of tags for the tag name first; names that
        are not cached tags are skipped.

        names -- iterable of tag names

        Returns True unless the store is not available yet or removing a
        tag raised; the exception is logged and suppressed, and tags
        removed before it stay removed.

        NOTE(review): nothing here refreshes the cached groups, tags and
        nodes; callers presumably follow up with refresh() -- confirm.
        '''
        import sys

        if self._store is None:
            log.debug("WallabyOperations: remove_tag, store object not yet created")
            return False
        for name in names:
            if self.get_tag_by_name(name) is not None:
                try:
                    self._store.removeGroup(name)
                except Exception:
                    log.debug("WallabyOperations: remove_tag, exception suppressed, %s" % str(sys.exc_info()[1]))
                    return False
        return True