class Scheduler(object):

    stdin = "/dev/null"
    stdout = "/dev/null"
    stderr = "/dev/null"

    def __init__(self):
        self.running = True
        self.resource_manager = ResourceManager(self)
        self.resource_queues = {'cpu': [], 'memory': [], 'bw': []}
        ''' this is to have multiple applications/queues; application = queue '''
        self.task_queue = []  # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # queues with pending tasks, which do not have the input ready
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        # the task logger writes to its own file (task.log), not scheduler.log
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication with the
        local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and task
        dependencies '''
        # taskid: {'nfiles': 0, 'ntotalfiles': x, 'outputs': [files]}
        self.task_to_file = {}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have:
        the thread that listens for messages
        a queue in which the tasks are put
        the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []

    def delete_queue(self, qid):
        del self.task_queue_data[qid]
        self.task_queue.remove(qid)
        self.new_queues_lock.acquire()
        try:
            self.new_queues.remove(qid)
        except:
            pass
        self.new_queues_lock.release()

    def get_taskids(self):
        tasks_ids = []
        try:
            tasks_ids = self.result_queue.get(
                block=True, timeout=8 * config.WAITTIME)
        except:
            # no results arrived in time; check whether any pending task has
            # all of its input files on disk by now and promote it
            self.pending_task_queue_lock.acquire()
            pending_queue = self.pending_queue_data
            for pqueue in pending_queue:
                to_delete = []
                for tid in pending_queue[pqueue]:
                    current_inputs = 0
                    tinputs = len(pending_queue[pqueue][tid]['inputs'])
                    for inputf in pending_queue[pqueue][tid]['inputs']:
                        if os.path.isfile(inputf):
                            current_inputs = current_inputs + 1
                    if current_inputs == tinputs:
                        task = pending_queue[pqueue][tid]
                        data = {
                            'id': tid,
                            'exec': task['exec'],
                            'params': task['params']}
                        to_delete.append(tid)
                        self.task_queue_lock.acquire()
                        is_new = False
                        try:
                            self.task_queue_data[pqueue].append(data)
                        except:
                            self.task_queue.append(pqueue)
                            self.task_queue_data[pqueue] = deque()
                            self.task_queue_data[pqueue].append(data)
                            is_new = True
                        self.task_queue_lock.release()
                        if is_new:
                            self.new_queues_lock.acquire()
                            self.new_queues.append(pqueue)
                            self.new_queues_lock.release()
                for tid in to_delete:
                    del self.pending_queue_data[pqueue][tid]
            self.pending_task_queue_lock.release()
        return tasks_ids

    def check_dependencies_per_task(self, taskid):
        for fileid in self.task_to_file[taskid]['outputs']:
            dependent_tasks = []
            try:
                dependent_tasks = self.file_to_task[
                    hashlib.sha1(fileid.encode()).hexdigest()]['tids']
            except:
                pass
            for taskid2 in dependent_tasks:
                # one more input of taskid2 has been produced
                self.task_to_file[taskid2]['cinputs'] = \
                    self.task_to_file[taskid2]['cinputs'] + 1
                if self.task_to_file[taskid2]['cinputs'] == \
                        self.task_to_file[taskid2]['tinputs']:
                    # put in ready queue
                    self.pending_task_queue_lock.acquire()
                    try:
                        task = self.pending_queue_data[
                            self.task_to_file[taskid2]['queueid']][taskid2]
                    except:
                        self.pending_task_queue_lock.release()
                        continue
                    self.pending_task_queue_lock.release()
                    data = {
                        'id': taskid2,
                        'exec': task['exec'],
                        'params': task['params']}
                    self.task_queue_lock.acquire()
                    is_new = False
                    try:
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                    except:
                        # FCFS like
                        self.task_queue.append(
                            self.task_to_file[taskid2]['queueid'])
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']] = deque()
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                        is_new = True
                    self.task_queue_lock.release()
                    if is_new:
                        self.new_queues_lock.acquire()
                        self.new_queues.append(
                            self.task_to_file[taskid2]['queueid'])
                        self.new_queues_lock.release()
                    self.pending_task_queue_lock.acquire()
                    del self.pending_queue_data[
                        self.task_to_file[taskid2]['queueid']][taskid2]
                    self.pending_task_queue_lock.release()

    def garbage_collect(self, taskid):
        for fileid in self.task_to_file[taskid]['inputs']:
            try:
                # count how many consumers of this input file have finished
                self.file_to_task[
                    hashlib.sha1(fileid.encode()).hexdigest()]['ctids'] = \
                    self.file_to_task[
                        hashlib.sha1(fileid.encode()).hexdigest()]['ctids'] + 1
                if self.file_to_task[
                        hashlib.sha1(fileid.encode()).hexdigest()]['ctids'] == \
                        self.file_to_task[
                            hashlib.sha1(fileid.encode()).hexdigest()]['ntids']:
                    # now it is safe to delete this file (is it?)
                    try:
                        print "deleting file ", fileid, "ctids=", \
                            self.file_to_task[
                                hashlib.sha1(fileid.encode()).hexdigest()]['ctids'], \
                            "ntids=", \
                            self.file_to_task[
                                hashlib.sha1(fileid.encode()).hexdigest()]['ntids']
                        # os.remove(fileid)
                    # if removal failed, report it back to the user
                    except OSError as e:
                        print "Error: %s - %s." % (e.filename, e.strerror)
            except:
                print "exception for file ", fileid

    def get_result(self):
        while self.running:
            # drop pending queues that have no tasks left
            self.pending_task_queue_lock.acquire()
            to_delete = []
            for pqueue in self.pending_queue_data:
                if len(self.pending_queue_data[pqueue]) == 0:
                    to_delete.append(pqueue)
            for pqueue in to_delete:
                del self.pending_queue_data[pqueue]
            self.pending_task_queue_lock.release()
            tasks_ids = self.get_taskids()
            if len(tasks_ids) == 0:
                continue
            for taskid in tasks_ids:
                # if missing files were generated put task in ready queue
                try:
                    self.check_dependencies_per_task(taskid)
                except:
                    traceback.print_exc()

    def msg_process_callback(self, message):
        message_type = message[3]
        try:
            if message_type == PROTOCOL_HEADERS['CLIENT']:
                self.process_client_message(message)
            elif message_type == PROTOCOL_HEADERS['WORKER']:
                self.process_worker_message(message)
        except:
            traceback.print_exc()

    def process_queue(self, data, qid):
        task_data = pickle.loads(data)
        # Analyze the given DAG.
        self.resource_manager.initialize_dag(task_data)
        self.task_queue_lock.acquire()
        self.pending_task_queue_lock.acquire()
        for task in task_data:
            self.process_queue_task(task, qid)
        self.pending_task_queue_lock.release()
        self.task_queue_lock.release()

    def process_queue_task(self, task_data, qid):
        self.task_id = self.task_id + 1
        splited_inputs = task_data['inputs'].split()
        total_inputs = len(splited_inputs)
        current_inputs = 0
        for inputf in splited_inputs:
            if os.path.isfile(inputf):
                current_inputs = current_inputs + 1
            try:
                self.file_to_task[
                    hashlib.sha1(inputf.encode()).hexdigest()]['tids'].append(
                    self.task_id)
                self.file_to_task[
                    hashlib.sha1(inputf.encode()).hexdigest()]['ntids'] = \
                    self.file_to_task[
                        hashlib.sha1(inputf.encode()).hexdigest()]['ntids'] + 1
            except:
                self.file_to_task[
                    hashlib.sha1(inputf.encode()).hexdigest()] = {
                    'ntids': 1, 'tids': [], 'ctids': 0}
                self.file_to_task[
                    hashlib.sha1(inputf.encode()).hexdigest()]['tids'] = [
                    self.task_id]
        self.task_to_file[self.task_id] = {
            'queueid': qid,
            'tinputs': total_inputs,
            'cinputs': current_inputs,
            'inputs': splited_inputs,
            'outputs': task_data['outputs'].split()}
        if current_inputs == total_inputs:
            print "Putting task ", self.task_id, " in active queue", qid
            if qid not in self.task_queue:
                try:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                except:
                    traceback.print_exc()
            self.task_queue_data[qid].append(
                {'id': self.task_id,
                 'exec': task_data['exec'],
                 'params': task_data['params']})
        else:
            print(
                "Putting task %s in pending queue, total inputs: %s" %
                (self.task_id, total_inputs))
            task_info = {
                'id': self.task_id,
                'exec': task_data['exec'],
                'params': task_data['params'],
                'inputs': splited_inputs}
            try:
                self.pending_queue_data[qid][self.task_id] = task_info
            except:
                self.pending_queue_data[qid] = {}
                self.pending_queue_data[qid][self.task_id] = task_info

    def process_status(self, qid, message):
        self.task_queue_lock.acquire()
        ntasks = len(self.task_queue_data[qid])
        self.task_queue_lock.release()
        reply = [
            message[0],
            PROTOCOL_HEADERS['RSMNG'],
            'status',
            str(ntasks)]
        self.server_thread.put_request_in_queue(reply)

    def process_wait(self, qid, message):
        self.workers_lock.acquire()
        self.workers_empty = 0
        self.workers_empty_dict = {}
        self.workers_lock.release()
        self.waiting_clients.append(message[0])

    def process_delete(self, data):
        worker_id = 'sched-' + data
        with open(os.devnull, 'w') as devnull:
            # the command is a single shell string, so run it through a shell
            proc = subprocess.Popen(
                "ssh %s \" pkill -9 local_resourcem \" " % data,
                shell=True,
                stdout=devnull,
                stderr=devnull)
            proc.wait()
        # we delete all info about the worker....
        self.workers_lock.acquire()
        self.workers.remove(worker_id)
        del self.workers_data[worker_id]
        self.workers_lock.release()

    def process_client_message(self, message):
        tmp_msg = message[4:]
        qid = message[1]
        action = tmp_msg[0]
        data = None
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        print "I receive message for queue ", qid, " action: ", action
        if action == 'queue':
            if data is None:
                # print "Missing task information"
                return
            self.process_queue(data, qid)
        elif action == 'status':
            self.process_status(qid, message)
        elif action == 'wait':
            self.process_wait(qid, message)
        elif action == 'clear':
            print "I will empty the worker queues"
            for worker in self.workers:
                self.server_thread.put_request_in_queue(
                    [worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
        elif action == 'delete':
            self.process_delete(data)
        else:
            print "Not implemented yet"

    def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
        queue_id = self.task_queue[0]
        ntasks = min(
            len(self.task_queue_data[queue_id]),
            int(tasks_per_queue))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
        # send only from the first queue
        queue_id = self.task_queue[0]
        if queue_id not in tasks_per_queue:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue.values()[0]))
        else:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue[queue_id]))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue(self, worker_id, type, data):
        ''' FCFS queue for the static policy '''
        randn = 0
        queue_id = self.task_queue[randn]
        answer = {'queues': {}, 'tasks': []}
        if type == 'task_empty':
            tasks_per_queue = data
            answer = self.process_worker_nonempty_queue_static(
                tasks_per_queue, answer)
        else:
            tasks_per_queue = pickle.loads(data)
            answer = self.process_worker_nonempty_queue_static1(
                tasks_per_queue, answer)
        for qid in self.task_queue_data:
            if qid not in tasks_per_queue:
                answer['queues'][qid] = 0
        # TODO: piggyback the ids of other queues, if no queues > 1
        ''' if we don't have any data left in the queue we delete it '''
        ''' if the worker sent us an empty message reset it '''
        self.workers_lock.acquire()
        if self.workers_empty_dict.get(worker_id):
            del self.workers_empty_dict[worker_id]
        self.workers_lock.release()
        return answer

    def process_worker_message(self, message):
        tmp_msg = message[4:]
        type = tmp_msg[0]
        # If we are dealing with a resource or statistics message, let the
        # resource manager handle it.
        if type == 'resource' or type == 'statistics':
            self.resource_manager.process_worker_message(message)
            return
        else:
            data = None
            first_worker = False
            self.workers_lock.acquire()
            if message[0] not in self.workers:
                self.workers.append(message[0])
                self.workers_data[message[0]] = 0
            if len(tmp_msg) > 1:
                data = tmp_msg[1]
            if len(tmp_msg) > 2:
                data2 = pickle.loads(tmp_msg[2])
                computed_tasks = data2['ran']
                worker_current_size = data2['qsize']
                self.workers_data[message[0]] = float(worker_current_size)
                if len(computed_tasks) > 0:
                    # Tell the resource manager that this worker has finished
                    # these tasks.
                    self.resource_manager.finished_tasks(
                        message[0], computed_tasks)
                    self.result_queue.put(computed_tasks)
            if type == 'task_empty':
                self.workers_data[message[0]] = 0
            self.workers_lock.release()
            if type == 'task' or type == 'task_empty':
                ''' send x tasks to it '''
                ''' this is a hack; it works only with one client '''
                self.task_queue_lock.acquire()
                task_queue_len = len(self.task_queue)
                self.task_queue_lock.release()
                self.pending_task_queue_lock.acquire()
                pending_task_queue_len = len(self.pending_queue_data)
                self.pending_task_queue_lock.release()
                if type == 'task_empty' and task_queue_len == 0 \
                        and pending_task_queue_len == 0 \
                        and len(self.waiting_clients) > 0:
                    self.workers_lock.acquire()
                    if not self.workers_empty_dict.get(
                            message[0]) and not first_worker:
                        self.workers_empty_dict[message[0]] = 1
                    self.workers_lock.release()
                answer = {'queues': {}, 'tasks': []}
                wake_client = False
                self.task_queue_lock.acquire()
                task_queue_len = len(self.task_queue)
                sent_tasks = False
                if task_queue_len > 0:
                    sent_tasks = True
                    answer = self.process_worker_nonempty_queue(
                        message[0], type, data)
                else:
                    self.workers_lock.acquire()
                    if pending_task_queue_len == 0 and len(
                            self.workers_empty_dict) >= len(self.workers):
                        wake_client = True
                    self.workers_lock.release()
                self.task_queue_lock.release()
                if len(answer['tasks']) > 0:
                    data = pickle.dumps(answer)
                    self.server_thread.put_request_in_queue(
                        [message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
                    # Inform the resource manager we have sent tasks.
                    self.resource_manager.sent_tasks_to_worker(
                        message[0], answer['tasks'])
                if wake_client:
                    for client in self.waiting_clients:
                        # calculate the cost
                        self.resource_manager.calculate_cost()
                        print "Sending wake up message to ", client
                        self.server_thread.put_request_in_queue(
                            [client, PROTOCOL_HEADERS['RSMNG'], 'done'])
                    self.waiting_clients = []
                    self.workers_empty = 0
                    return
                if sent_tasks:
                    return
            elif type == 'output':
                ''' un-serialize the output and append it to a log '''
                output = pickle.loads(data)
                self.task_logger.info(output)

    def send_message(self, message):
        """
        Convenience method to let the resource manager send a message.

        :param message:
        :return:
        """
        self.server_thread.put_request_in_queue(message)

    def run(self):
        self.server_thread.start()
        while self.running:
            try:
                ''' compute the number of workers and data nodes for each
                scheduling period '''
                if self.running:
                    time.sleep(config.WAITTIME)
            except KeyboardInterrupt:
                self.shutdown()
            except:
                traceback.print_exc(self.logger.path)
                try:
                    time.sleep(config.WAITTIME)
                except KeyboardInterrupt:
                    self.shutdown()
        print "Stopping communication thread...."
        sys.stdout.flush()
        self.logger.info("Stopping communication thread....")
        self.server_thread.stop()
        print "Joining communication thread...."
        sys.stdout.flush()
        self.logger.info("Joining communication thread....")
        self.server_thread.join()
        if config.START_MEMFS:
            print "Stopping MemFS"
            sys.stdout.flush()
            self.resource_manager.stop_memfs()
        print "DONE"
        self.logger.info("DONE")
        return

    def shutdown(self):
        print "Received signal to shutdown. "
        sys.stdout.flush()
        self.logger.info("Received signal to shutdown. Will wait for the end "
                         "of the scheduling period")
        self.running = False
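
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original scheduler): process_queue()
# unpickles a list of task dictionaries and process_queue_task() reads the
# keys 'exec', 'params', 'inputs' and 'outputs', where inputs/outputs are
# whitespace-separated file lists. The helper below only shows how such a
# payload could be built and pickled on the client side; its name and the
# example paths are hypothetical, and the ZMQ framing used to deliver it
# under the 'queue' action is left out because the client code is not shown
# in this file.
def build_example_task_payload():
    import pickle  # local import so the sketch is self-contained
    example_tasks = [
        {'exec': '/bin/grep',
         'params': 'foo /tmp/input_a.txt',
         'inputs': '/tmp/input_a.txt',
         'outputs': '/tmp/stage1.out'},
        # this task stays in the pending queue until /tmp/stage1.out exists
        {'exec': '/usr/bin/wc',
         'params': '-l /tmp/stage1.out',
         'inputs': '/tmp/stage1.out',
         'outputs': '/tmp/final.out'},
    ]
    return pickle.dumps(example_tasks)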
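
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal way to
# launch the scheduler when this file is executed directly, assuming the
# names used above (config, ResourceManager, ZmqConnectionThread, ...) are
# importable. The real entry point is not shown in this section; run() blocks
# until shutdown() sets self.running to False.
if __name__ == '__main__':
    scheduler = Scheduler()
    try:
        scheduler.run()
    except KeyboardInterrupt:
        scheduler.shutdown()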