# Exemplo n.º 1
class NodeScheduler(object):
	def __init__(self):
		# Determine this node's identity: the IPv4 address bound to the
		# configured network interface, parsed out of `ifconfig` output.
		# NOTE(review): the popen handle is never closed and the parsing is
		# Linux/ifconfig-specific — verify on the target platform.
		interface = config.INTERFACE
		f = os.popen('ifconfig ' + str(interface) + ' | grep "inet\ addr" | cut -d: -f2 | cut -d" " -f1')
		self.identity = f.read().strip()
		self.queue_data_lock = threading.Lock()

		# Make a random hash for the queue.
		self.random_hash = hashlib.sha1(b'/tmp/Weasel/bin/local_resourcemanager').hexdigest()
		# Per-worker-thread bookkeeping, populated by run_task:
		# thread ident -> {'lock', 'proc', 'ctask', 'task'}.
		self.task_data = {}
		# qid -> {'qid', 'asked', 'recv', 'tpool'}.
		self.queue_data = {}
		self.max_tasks_to_run_for_queue = {}

		# Will hold the timestamp when we first received a task.
		self.time_start_running = -1

		# Initially we run two tasks per CPU core. This number might
		# be changed by the monitor thread based on resource contention.
		if config.ADAPT_TASKS:
			self.max_tasks_to_run = multiprocessing.cpu_count() * 2
		else:
			self.max_tasks_to_run = config.NR_COLOCATED_TASKS

		# Create the queue for the tasks.
		self.queue_data[self.random_hash] = {'qid': self.random_hash,
											 'asked': 0, 'recv': 0,
											 'tpool': ThreadPool(1, self.task_data)}
		tpool = self.queue_data[self.random_hash]['tpool']
		self.max_tasks_to_run_for_queue[self.random_hash] = self.max_tasks_to_run
		tpool.set_size(self.max_tasks_to_run)
		tpool.start()

		# Create the monitoring thread.
		self.monitor_thread = NodeMonitor(parent=self)

		# Create the thread that performs communication with
		# the scheduler.
		self.sched_client_thread = ZmqConnectionThread(
			self.identity,
			zmq.DEALER,
			config.SCHEDULER + ":" + str(config.ZMQ_SCHEDULER_PORT),
			self.callback)
		self.running = True
		logfile = config.LOGDIR + "/local_scheduler.log"
		self.logger = WeaselLogger('local_scheduler', logfile)
		self.ntasks_to_ask = 1
		self.task_id = 1
		self.time_asked_first = time.time()
		# Count of currently running tasks; guarded by running_task_lock.
		self.running_task = 0
		self.nr_received_tasks = 0
		# Ids of tasks finished since the last report to the scheduler.
		self.nran_tasks = []
		self.queues_asked_for = []
		self.current_ntasks = 1
		self.has_new_task = False
		self.first_task = False
		self.running_task_lock = threading.Lock()

		# Will hold a map of nr_colocated_tasks => [runtimes]
		self.task_runtimes = {}
		self.task_runtime_lock = threading.Lock()
		self.sleep_time = config.WAITTIME
		# Per-task runtime estimate (seconds) used to report backlog size.
		self.task_time = 1
		self.has_new_queue = False
		self.new_queues = []
		self.logger.info("NodeScheduler started...")
		self.nrunning_past_period = []

		# Will hold the last started executable.
		self.last_started = None

	def run_task(self, arg):
		"""
		Execute one task in the calling worker thread and record its runtime.

		:param arg: task descriptor dict with keys 'id', 'exec', 'params'
			and 'qid' (set by add_task_to_queues).
		"""
		command_id = arg['id']
		command = arg['exec'] + ' ' + arg['params']
		qid = arg['qid']
		myid = threading.current_thread().ident

		# Tell the monitor thread we have started a task.
		self.monitor_thread.task_started(myid)

		# Increment the number of running tasks and remember how many tasks
		# were co-located with this one at start time.
		self.running_task_lock.acquire()
		self.running_task += 1
		nr_colocated = self.running_task

		# The first time we run a task of a different type, we reset
		# the history on the monitor thread.
		if self.last_started is None:
			self.last_started = arg['exec']
		if self.last_started != arg['exec']:
			print('Started new task type: ' + str(arg['exec']))
			sys.stdout.flush()
			self.monitor_thread.reset_known_points()
			self.last_started = arg['exec']
		self.running_task_lock.release()

		start_time = time.time()
		# NOTE(review): the command runs through the shell; task strings are
		# assumed to come from a trusted scheduler — confirm upstream.
		proc = psutil.Popen(command, shell=True,
							stdout=PIPE, stderr=PIPE)
		self.task_data[myid]['lock'].acquire()
		self.task_data[myid]['proc'] = proc
		self.task_data[myid]['ctask'] = arg
		self.task_data[myid]['lock'].release()
		out, err = proc.communicate()
		return_code = proc.returncode
		if return_code != 0:
			print('Error when returning: ' + str(return_code))
			sys.stdout.flush()
		end_time = time.time()

		# Record the running time, bucketed by the number of tasks that were
		# running concurrently when this one started.
		self.task_runtime_lock.acquire()
		running_time = end_time - start_time
		if nr_colocated not in self.task_runtimes:
			self.task_runtimes[nr_colocated] = []
		self.task_runtimes[nr_colocated].append(running_time)
		print("Task %s ran in %s seconds (%s)" % (str(command_id), str(running_time), str(arg['exec'])))
		sys.stdout.flush()
		self.task_runtime_lock.release()

		# Tell the monitor thread we have finished a task.
		self.monitor_thread.task_finished(myid)

		self.task_data[myid]['lock'].acquire()
		self.task_data[myid]['ctask'] = None
		if self.task_data[myid]['task'].get(qid) is None:
			self.task_data[myid]['task'][qid] = []
			self.external_change = True
		# Bugfix: the original appended 100 * runtime / runtime, which is
		# always 100 but raised ZeroDivisionError whenever the task finished
		# within one clock tick (runtime == 0). Store the constant directly.
		self.task_data[myid]['task'][qid].append([running_time, 100.0])
		self.task_data[myid]['lock'].release()
		self.running_task_lock.acquire()
		self.running_task -= 1
		self.nran_tasks.append(command_id)
		self.running_task_lock.release()

	def get_total_queue_size(self):
		"""Return the combined number of queued tasks across all queues."""
		self.queue_data_lock.acquire()
		total = sum(entry['tpool'].tasks.qsize()
					for entry in self.queue_data.values())
		self.queue_data_lock.release()
		return total

	def get_tasks_to_ask(self):
		"""
		Returns the number of tasks to ask from the scheduler. Tries to keep the queue size at
		least as long as the maximum allowed number of tasks at the moment.
		:return: (dict of qid -> count to request, total queued task count)
		"""
		tasks_to_ask = {}
		self.queues_asked_for = []
		queue_size = self.get_total_queue_size()
		self.queue_data_lock.acquire()
		for qid, qdata in self.queue_data.items():
			tasks_to_ask[qid] = 0
			qdata['asked'] = 0
			qdata['recv'] = 0
			limit = self.max_tasks_to_run_for_queue[qid]
			pending = qdata['tpool'].tasks.qsize()
			# A bounded queue that is already more than twice full gets
			# skipped entirely (not even recorded as asked-for).
			if limit != -1 and pending > 2 * limit:
				continue
			if pending == 0:
				tasks_to_ask[qid] = limit
			elif limit != -1 and pending > limit:
				continue
			elif pending < limit:
				tasks_to_ask[qid] = limit - pending
			self.queues_asked_for.append(qid)
			qdata['asked'] = tasks_to_ask[qid]
		self.queue_data_lock.release()
		return tasks_to_ask, queue_size

	def wait_and_ask(self):
		"""
		Report-and-request loop toward the central scheduler.

		Runs until self.running becomes False. Each iteration sends the ids
		of tasks finished since the previous report plus the per-queue
		number of tasks this node wants next.
		"""
		while self.running:
			# Poll roughly five times per second.
			time.sleep(0.2)

			self.running_task_lock.acquire()
			nrunning = self.running_task
			self.nrunning_past_period.append(nrunning)
			# Hand over (and reset) the list of finished task ids.
			task_data_to_send = {'ran': self.nran_tasks[:]}
			self.nran_tasks = []
			self.running_task_lock.release()
			(tasks_to_ask, queue_size) = self.get_tasks_to_ask()
			# Estimated backlog in seconds: queued tasks times the per-task
			# runtime estimate.
			task_data_to_send['qsize'] = queue_size * self.task_time
			pickled_data = pickle.dumps(task_data_to_send)
			if len(tasks_to_ask) > 0:
				self.sched_client_thread.put_request_in_queue(
					[self.identity, PROTOCOL_HEADERS['WORKER'], 'task', pickle.dumps(tasks_to_ask), pickled_data])

	def process_task(self, task):
		"""
		Inspect a task command line and report whether it introduces a new
		task type. Detection is effectively disabled: always returns False.

		:param task: full command string with ';'-separated fields.
		:return: False
		"""
		fields = task.split(';')
		# Extract the executable basename from the last field. The value is
		# unused, but the computation is kept so malformed input fails the
		# same way as before.
		task_name = fields[-1].split()[0].split('/')[-1]
		return False

	def add_task_to_queues(self, tasks):
		"""
		Accept a batch of tasks from the scheduler and enqueue each one on
		the local thread pool under the node-wide queue hash.

		:param tasks: dict carrying a 'tasks' list of task descriptors.
		"""
		pool = self.queue_data[self.random_hash]['tpool']
		for task in tasks['tasks']:
			self.running_task_lock.acquire()
			self.nr_received_tasks += 1
			self.running_task_lock.release()
			# Flag whether this batch introduced a new task type.
			self.has_new_task |= self.process_task(task['exec'])
			# Tag the task with a hash of its executable string.
			task['qid'] = hashlib.sha1(task['exec'].encode()).hexdigest()
			pool.add_task(self.run_task, task)

	def get_latest_task_type(self):
		"""
		Return the executable of the most recently started task, or None if
		no task has started yet.
		:return:
		"""
		with self.running_task_lock:
			return self.last_started

	def running_identical_tasks(self):
		"""
		Returns whether or not the worker is currently only running tasks of the same type.
		:return:
		"""
		current = None
		task_threads = self.task_data.keys()
		try:
			for task_thread in task_threads:
				if (task_thread not in self.task_data) or ('lock' not in self.task_data[task_thread]):
					continue
				self.task_data[task_thread]['lock'].acquire()
				if 'ctask' in self.task_data[task_thread] and self.task_data[task_thread]['ctask'] is not None:
					if current is None:
						current = self.task_data[task_thread]['ctask']['exec']
					elif current != self.task_data[task_thread]['ctask']['exec']:
						self.task_data[task_thread]['lock'].release()
						return False
				self.task_data[task_thread]['lock'].release()
			return True
		except Exception, e:
			print('Got exception while trying to determine identical tasks')
			print(e)
			sys.stdout.flush()