def __init__(self, monque, task, args, kwargs, config):
    """
    Capture everything needed to post `task` to a queue.

    monque -- queue instance the task is posted through (stored as-is)
    task   -- task object; supplies the parent config, the name (via
              get_name()) and the logger
    args   -- positional arguments for the task, stored as-is
    kwargs -- keyword arguments for the task, stored as-is
    config -- mapping of per-post settings, expanded into a Configuration
              whose parent is the task's own config

    Settings read from the resulting config: "queue" (default "default"),
    "priority", "max_in_queue", "max_running", "must_be_unique" and
    "unique_kwargs"; the start time comes from get_start_time().
    """
    cfg = Configuration(**config)
    cfg.parent = task.config
    self.config = cfg

    self.monque = monque
    self.task = task
    self.name = task.get_name()
    self.args, self.kwargs = args, kwargs

    # Not stored anywhere yet: no collection, no id, no serialized doc.
    self.collection = self.id = self.doc = None

    self.logger = task.logger
    self.queue = cfg.get("queue", "default")
    self.priority = cfg.get("priority", None)
    self.start_time = self.get_start_time()
    self.result = None

    # Numeric limits are coerced to int; 0 is the default when unset.
    self.max_in_queue = int(cfg.get("max_in_queue", 0))
    self.max_running = int(cfg.get("max_running", 0))
    self.must_be_unique = cfg.get("must_be_unique", False)
    self.unique_kwargs = cfg.get("unique_kwargs", None)
def connect(self, init=False):
    """
    Open the MongoDB connection (if there is none yet) and bind the
    database, the global configuration and the collections.

    init -- passed straight through to get_collections(init=...)

    Config keys read: "mongo.host" (default "localhost", may carry a
    ":port" suffix), "mongo.port" (default 27017, only consulted when the
    host has no port suffix), "mongo.db" (default "monque") and
    "mongo.config" (default "config").
    """
    self.logger.debug("Monque.connect()")

    if not self.connection:
        address = self.config.get("mongo.host", "localhost")
        if ":" not in address:
            host = address
            port = int(self.config.get("mongo.port", 27017))
        else:
            # "host:port" form; split on the first colon only.
            host, _, port_text = address.partition(":")
            port = int(port_text)
        self.connection = pymongo.MongoClient(host, port)

    # NOTE(review): the setup below is assumed to run on every call, not
    # only when a new client was created -- confirm against the original
    # indentation.
    self.db = self.connection[self.config.get("mongo.db", "monque")]

    config_name = self.config.get("mongo.config", "config")
    self.global_config = Configuration.get_global(self.db, config_name)
    self.config.parent = self.global_config

    self.get_collections(init=init)
def init_config(self, kwargs):
    """
    Create a config object that encapsulates all the config settings for
    this task, including those inherited from base classes, from the
    queue, etc.

    kwargs -- mapping of explicit settings, expanded into Configuration()

    The new config's parent is the queue's config (self.monque.config).
    Every class attribute whose name does not start with '__' is then
    copied into the config with config.set().
    """
    self.config = Configuration(**kwargs)
    self.config.parent = self.monque.config

    # Walk the class hierarchy from the most generic base down to the
    # concrete class, so the lowest class in the hierarchy 'sticks' in the
    # final config. reversed() and items() work on both Python 2 and 3;
    # the previous filter(...).reverse() and iteritems() fail on Python 3.
    for cls in reversed(self.__class__.__mro__):
        if cls is object:
            continue
        for k, v in list(vars(cls).items()):
            if k.startswith('__'):
                continue
            self.config.set(k, v)
class Task(object):
    """
    A Task object can be executed remotely via the queue, or can be executed
    directly as a callable or just invoking the run() method.

    A Task object is actually more of an 'actor', in that the remote workers
    will only instantiate the Task class once, and reuse that instance every
    time it is processing a Task of that type. So implementation should be
    careful not to store state in the instance (except perhaps cached data
    to be reused for future invocations of the task)
    """

    def __init__(self, **kwargs):
        """
        Bind the task to a queue instance and build its configuration.

        Two keyword arguments are consumed here; everything else is passed
        on to init_config() as config settings:
          monque -- queue instance to use; when missing or falsy,
                    monque.instance.current_instance is used instead
          logger -- logger to use; when missing or falsy, the queue's
                    get_logger() is used instead

        Raises Task.NoQueue when no queue instance can be found.
        """
        self.monque = kwargs.pop('monque', None)
        if not self.monque:
            self.monque = monque.instance.current_instance
        if not self.monque:
            raise Task.NoQueue("No Monque queue instance")

        self.logger = kwargs.pop('logger', None)
        if not self.logger:
            # self.monque is always set here (NoQueue was raised otherwise),
            # so no other fallback is reachable.
            self.logger = self.monque.get_logger()

        self.init_config(kwargs)

    def init_config(self, kwargs):
        """
        Create a config object that encapsulates all the config settings for
        this task, including those inherited from base classes, from the
        queue, etc.

        kwargs -- mapping of explicit settings, expanded into Configuration()
        """
        self.config = Configuration(**kwargs)
        self.config.parent = self.monque.config

        # Walk the class hierarchy from the most generic base down to the
        # concrete class, so the lowest class in the hierarchy 'sticks' in
        # the final config. reversed()/items() work on Python 2 and 3.
        for cls in reversed(self.__class__.__mro__):
            if cls is object:
                continue
            for k, v in list(vars(cls).items()):
                if k.startswith('__'):
                    continue
                self.config.set(k, v)

    def get_name(self):
        """Return the task's name: the name of its class."""
        return self.__class__.__name__

    def __call__(self, *args, **kwargs):
        """Calling the task object runs it directly via run()."""
        return self.run(*args, **kwargs)

    def run(self, *args, **kwargs):
        """
        This is to be implemented by subclasses
        """
        raise NotImplementedError()

    def is_run_by_worker(self):
        """Return True when this task's queue instance is a Worker."""
        # Imported here rather than at module level, as in the original.
        from monque.worker import Worker
        return isinstance(self.monque, Worker)

    def post(self, args=None, kwargs=None, **config):
        """
        Submit this task to the queue to be executed by a (remote) worker.
        Result is a TaskRemote instance that can be used to monitor
        progress and get results back.

        args   -- positional arguments for the task (default: empty list)
        kwargs -- keyword arguments for the task (default: empty dict)
        config -- per-post settings, forwarded to the queue's post()
        """
        # Fresh containers per call: mutable defaults in the signature
        # would be shared between every post() that omits them.
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}
        return self.monque.post(self, args, kwargs, config)

    @classmethod
    def find_task_class(klass, class_name):
        """
        Look up a task class by name among all known subclasses.

        A match on the full dotted name ('module.ClassName') wins; failing
        that, the last dotted component is matched against the bare class
        name. Raises Task.ClassNotFound when nothing matches.
        """
        all_subclasses = klass.find_all_task_classes()

        # Find fullname match if possible:
        for sub in all_subclasses:
            full_name = sub.__module__ + '.' + sub.__name__
            if full_name == class_name:
                return sub

        # Find short name match:
        short = class_name.split('.')[-1]
        for sub in all_subclasses:
            if sub.__name__ == short:
                return sub

        raise Task.ClassNotFound(class_name)

    @classmethod
    def find_all_task_classes(klass):
        """
        Return every direct and indirect subclass of this class, except
        those that define '__obsolete__' in their own class body.

        The list is ordered by (module, class name) and contains each class
        once, even when it is reachable through several base classes.
        """
        found = []
        seen = set()
        pending = list(klass.__subclasses__())
        while pending:
            sub = pending.pop()
            if sub in seen:
                # Reached again through another base (diamond inheritance).
                continue
            seen.add(sub)
            found.append(sub)
            pending.extend(sub.__subclasses__())

        not_obsolete = [sub for sub in found
                        if '__obsolete__' not in sub.__dict__]
        # Class objects are not orderable on Python 3 (and ordered by
        # address on Python 2), so sort on an explicit, stable key.
        return sorted(not_obsolete,
                      key=lambda sub: (sub.__module__, sub.__name__))

    class NoQueue(Exception):
        """Raised when a Task is created without any queue instance."""
        pass

    class ClassNotFound(Exception):
        """Raised by find_task_class() when no task class matches."""
        pass
class PostedTask(object):
    """
    In-memory representation of a task posted (or to be posted) to a queue.
    """

    # Reference point for converting numeric 'at' values to naive UTC.
    _EPOCH = datetime.datetime(1970, 1, 1)

    def __init__(self, monque, task, args, kwargs, config):
        """
        Capture everything needed to post `task` to a queue.

        monque -- queue instance the task is posted through
        task   -- task object; supplies the parent config, the name (via
                  get_name()) and the logger
        args   -- positional arguments for the task
        kwargs -- keyword arguments for the task
        config -- mapping of per-post settings, expanded into a
                  Configuration whose parent is the task's own config
        """
        self.config = Configuration(**config)
        self.config.parent = task.config
        self.monque = monque
        self.task = task
        self.name = task.get_name()
        self.args = args
        self.kwargs = kwargs
        self.collection = None
        self.id = None
        self.doc = None
        self.logger = self.task.logger
        self.queue = self.config.get("queue", "default")
        self.priority = self.config.get("priority", None)
        self.start_time = self.get_start_time()
        self.result = None
        self.max_in_queue = int(self.config.get("max_in_queue", 0))
        self.max_running = int(self.config.get("max_running", 0))
        self.must_be_unique = self.config.get("must_be_unique", False)
        self.unique_kwargs = self.config.get("unique_kwargs", None)

    @staticmethod
    def _is_number(value):
        """Return True for int/float values; bool is deliberately excluded."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def get_start_time(self):
        """
        Work out the earliest time the task may start, from the config.

        "at"    -- absolute start: a datetime (returned unchanged) or a
                   number of seconds since the UNIX epoch
        "delay" -- relative start: a timedelta or a number of seconds,
                   added to the current UTC time

        "at" takes precedence over "delay". Returns a naive UTC datetime,
        or None when neither setting is present (or both are falsy).
        Raises ValueError for a value of any other type.
        """
        absolute = self.config.get("at")
        if absolute:
            if isinstance(absolute, datetime.datetime):
                return absolute
            if self._is_number(absolute):
                # Convert to naive UTC, not local time: get_next() compares
                # start_time against utcnow().
                return self._EPOCH + datetime.timedelta(seconds=absolute)
            raise ValueError("Unrecognized format of 'at': %s" % (absolute,))

        delay = self.config.get("delay")
        if delay:
            if isinstance(delay, datetime.timedelta):
                return datetime.datetime.utcnow() + delay
            if self._is_number(delay):
                return (datetime.datetime.utcnow() +
                        datetime.timedelta(seconds=delay))
            raise ValueError("Unrecognized format of 'delay': %s" % (delay,))

        return None

    def save_into(self, collection):
        """
        This is where the task actually gets inserted into the collection.
        The document is serialized on first use and cached in self.doc;
        self.id is taken from the saved document's "_id".

        TODO: options for write concern, etc?
        """
        if not self.doc:
            self.doc = self.serialize()

        collection.save(self.doc)
        self.id = self.doc["_id"]

    def mark_running(self):
        """
        Flag the task as running, both in the cached document (if any) and
        in the stored document (if the task has a collection and an id).
        """
        if self.doc:
            self.doc["status"] = "running"
            self.doc["started_at"] = datetime.datetime.utcnow()

        # NOTE: the stored update reads its values from self.doc, so it
        # still requires a cached document, exactly as before.
        if self.collection is not None and self.id:
            self.collection.find_and_modify(
                query={"_id": self.id},
                update={"$set": {"status": self.doc["status"],
                                 "started_at": self.doc["started_at"]}},
            )

    def remove(self):
        """Delete the stored document, if the task has a collection and id."""
        if self.collection is not None and self.id:
            self.collection.find_and_modify(query={"_id": self.id},
                                            remove=True)

    def serialize(self):
        """
        Return a serialized version (dict) of the task, as it is to be
        stored in the collection
        """
        # One timestamp, so created_at and submitted_at agree exactly.
        now = datetime.datetime.utcnow()
        doc = {
            "name": self.name,
            "class": self.task.__module__ + "." + self.task.__class__.__name__,
            "queue": self.queue,
            "payload": {"args": self.args, "kwargs": self.kwargs},
            "constraints": {},
            "created_at": now,
            "submitted_at": now,
            "status": "pending",
        }

        # Add constraints (only the ones that are actually set):
        constraints = doc["constraints"]
        if self.priority is not None:
            constraints["priority"] = self.priority
        if self.start_time:
            constraints["start_time"] = self.start_time
        if self.max_in_queue:
            constraints["max_in_queue"] = self.max_in_queue
        if self.max_running:
            constraints["max_running"] = self.max_running
        if self.must_be_unique:
            constraints["must_be_unique"] = True
        if self.unique_kwargs:
            constraints["unique_kwargs"] = self.unique_kwargs

        return doc

    def notify_workers(self, collection):
        """
        Add a doc to the collection (the activity log) that indicates new
        tasks in the queue, so workers that are tailing the collection can
        immediately pick it up
        """
        collection.insert({"task": self.id, "queue": self.queue})

    def notify_results(self, collection):
        """
        Add a doc to the collection (the activity log) that indicates task
        results are available, so clients waiting for the results can
        immediately pick it up
        """
        collection.insert({"result": self.id})

    @classmethod
    def get_next(klass, **kwargs):
        """
        Atomically take the next pending task from a collection.

        Keyword arguments:
          collection -- (required) collection to take the task from
          queue      -- queue name, or list/tuple of queue names, to
                        restrict the search to; any queue when omitted
          worker     -- value recorded in the task's "worker" field

        Only tasks without a start_time constraint, or whose start_time
        has passed, are eligible. Highest priority first, then lowest _id.
        Returns whatever find_and_modify() returns for the match.
        """
        collection = kwargs.pop("collection")
        queue = kwargs.pop("queue", None)
        worker = kwargs.pop("worker", None)

        # Set up the query filters:
        query = {"status": "pending"}
        if queue:
            if isinstance(queue, str):
                query["queue"] = queue
            elif isinstance(queue, (list, tuple)):
                if len(queue) == 1:
                    query["queue"] = queue[0]
                else:
                    query["queue"] = {"$in": list(queue)}

        now = datetime.datetime.utcnow()
        query["$or"] = [{"constraints.start_time": {"$exists": False}},
                        {"constraints.start_time": {"$lte": now}}]

        # As soon as it is picked up, mark it as 'taken',
        # which is the pre-cursor state to 'running',
        # in which pre-run conditions are checked, etc
        update = {"$set": {"status": "taken", "taken_at": now,
                           "worker": worker}}

        return collection.find_and_modify(
            query=query,
            update=update,
            new=True,
            sort=[("constraints.priority", pymongo.DESCENDING),
                  ("_id", pymongo.ASCENDING)],
        )

    def unget(self):
        """
        Put a task back into the queue that was 'incorrectly' taken.
        Usually this is for a task that is taken, then one or more
        pre-execution tasks fails (e.g. too many running tasks of a given
        type)
        """
        if self.collection is None or not self.id:
            return

        self.logger.debug("Task unget() id=%s" % (self.id))

        # "$set" (not "$et"): reset the status back to pending.
        self.collection.find_and_modify(
            query={"_id": self.id},
            update={"$set": {"status": "pending"}},
        )

    def wait(self, timeout=None):
        """
        Wait for the results of the task to be posted to the result queue.
        If timeout (given in seconds) is not None, then wait at most that
        long for the result notification; a timeout of 0 checks for a
        stored result once without waiting. With timeout=None, waits until
        a notification shows up.

        Returns the task's result if one is available (see
        handle_result()), otherwise None.
        """
        query = {"result": self.id}

        expire_at = None
        if timeout is not None:
            expire_at = time.time() + timeout

        while expire_at is None or time.time() < expire_at:
            tail = self.monque.activity_log.find(query, tailable=True,
                                                 await_data=False)
            notified = False
            for _doc in tail:
                notified = True
                break
            if notified:
                break
            time.sleep(0.1)

        result = self.monque.results_collection.find_one(self.id)
        if result:
            return self.handle_result(result)
        return None

    def handle_result(self, result):
        """
        Store a result document and translate it for the caller.

        Returns result["result"] when the status is "completed", raises
        PostedTask.RuntimeException carrying result["exception"] when the
        status is "failed", and returns None for any other status.
        """
        self.result = result
        status = self.result.get("status", None)
        if status == "completed":
            return self.result["result"]
        if status == "failed":
            raise PostedTask.RuntimeException(self.result["exception"])
        return None

    class RuntimeException(Exception):
        """Raised by handle_result() when the task reported a failure."""
        pass