def _eval_add_node(self):
    """
    Inspect the current state of the SGE queue and, if warranted, add
    nodes to the cluster.

    Nodes are added when the cluster is below ``min_nodes``, when there are
    no slots at all, or when queued jobs need more slots than are free and
    the oldest queued job has waited longer than
    ``longest_allowed_queue_time`` seconds. The number of nodes added in a
    single call is capped by ``add_nodes_per_iteration`` and by the room
    left below ``max_nodes``.

    Returns None; nodes are added as a side effect and the last cluster
    modification time is updated on success.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    # Only wait for stabilization when there is at least some capacity;
    # with zero slots nothing can run, so proceed immediately.
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum(int(j['slots']) for j in running_jobs)
    qw_slots = sum(int(j['slots']) for j in queued_jobs)
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # timedelta.seconds alone wraps every 24h (and is misleading for
        # negative deltas), so fold in the days component.
        waited_secs = age_delta.days * 86400 + age_delta.seconds
        if waited_secs > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (waited_secs, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                # Floor division keeps the count an integer on both
                # Python 2 and 3; request at least one node so a queue
                # smaller than a single host's slots is not starved.
                need_to_add = max(1, qw_slots // slots_per_host)
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            # Best effort: a failed add is logged and retried on a later
            # evaluation rather than aborting the caller.
            log.error("Failed to add new host", exc_info=True)
def _eval_remove_node(self):
    """
    Use the SGE stats to decide whether nodes should be removed from the
    cluster, and remove them.

    Nothing is removed while jobs are queued, while the cluster has not
    stabilized, or when the cluster is already at or below ``min_nodes``.
    Failures to remove an individual node are logged and do not stop the
    remaining candidates from being processed.
    """
    queued_count = len(self.stat.get_queued_jobs())
    if queued_count > 0 or not self.has_cluster_stabilized():
        return

    node_count = len(self._cluster.nodes)
    if node_count <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)"
                 % self.min_nodes)
        return

    log.info("Looking for nodes to remove...")
    candidates = self._find_nodes_for_removal(
        max_remove=node_count - self.min_nodes)
    if not candidates:
        log.info("No nodes can be removed at this time")

    for candidate in candidates:
        if candidate.update() == "running":
            log.warn("Removing %s: %s (%s)" %
                     (candidate.alias, candidate.id, candidate.dns_name))
            try:
                self._cluster.remove_node(candidate)
                self.__last_cluster_mod_time = utils.get_utc_now()
            except Exception:
                log.error("Failed to remove node %s" % candidate.alias,
                          exc_info=True)
        else:
            # Only nodes still reporting "running" are removed.
            log.error("Node %s is already dead - not removing" %
                      candidate.alias)
def get_all_stats(self):
    """
    Collect a snapshot of the current statistics as a list.

    The returned list contains, in order:
      0. the current UTC time
      1. the number of hosts
      2. the number of running jobs
      3. the number of queued jobs
      4. the total number of slots
      5. the average job duration
      6. the average job wait time
      7. the average load across hosts (0.0 when there are no loads)
    """
    # reduce() is only a builtin on Python 2; functools.reduce is the same
    # function there and the only spelling available on Python 3.
    from functools import reduce

    bits = [
        utils.get_utc_now(),
        self.count_hosts(),
        len(self.get_running_jobs()),
        len(self.get_queued_jobs()),
        self.count_total_slots(),
        self.avg_job_duration(),
        self.avg_wait_time(),
    ]
    loads = self.get_loads()
    # loads may be empty if there are no exec hosts
    if loads:
        avg_load = float(reduce(self._add, loads)) / len(loads)
    else:
        avg_load = 0.0
    bits.append(avg_load)
    return bits
def execute(self, args):
    """
    Validate the command line and request spot price history.

    Exactly one positional argument, a known instance type, is required.
    The -c and -v options are mutually exclusive, as are -d and -s. When
    a number of days is given, the start time is computed by subtracting
    that many days from the end time (or from the current UTC time when no
    end time was supplied).
    """
    opts = self.opts
    known_types = static.INSTANCE_TYPES
    instance_types = ', '.join(sorted(known_types.keys()))

    if len(args) != 1:
        self.parser.error(
            'please provide an instance type (options: %s)' %
            instance_types)
    if opts.classic and opts.vpc:
        self.parser.error(
            "options -c and -v cannot be specified at the same time")

    instance_type = args[0]
    if instance_type not in known_types:
        self.parser.error(
            'invalid instance type. possible options: %s' % instance_types)

    start, end = opts.start_time, opts.end_time
    if opts.days_ago:
        if start:
            self.parser.error(
                "options -d and -s cannot be specified at the same time")
        if end:
            end_tup = utils.iso_to_datetime_tuple(end)
        else:
            end_tup = utils.get_utc_now()
        lookback = timedelta(days=opts.days_ago)
        start = utils.datetime_tuple_to_iso(end_tup - lookback)

    browser_cmd = self.cfg.globals.get("web_browser")
    self.ec2.get_spot_history(
        instance_type, start, end,
        zone=opts.zone,
        plot=opts.plot,
        plot_web_browser=browser_cmd,
        vpc=opts.vpc,
        classic=opts.classic)
def __init__(self, remote_tzinfo=None):
    """
    Set up empty host/job/queue containers and the job-stat cache.

    remote_tzinfo - tzinfo to store on the instance; when not given (or
    falsy) the tzinfo of utils.get_utc_now() is used instead.
    """
    cache_size = 200
    self.jobstat_cachesize = cache_size
    self.hosts = []
    self.jobs = []
    self.queues = {}
    # fixed-size list of cache_size entries, all initially None
    self.jobstats = [None] * cache_size
    self.max_job_id = 0
    if not remote_tzinfo:
        remote_tzinfo = utils.get_utc_now().tzinfo
    self.remote_tzinfo = remote_tzinfo
def has_cluster_stabilized(self):
    """
    Return True when at least ``stabilization_time`` seconds have passed
    since the cluster was last modified, False otherwise.

    When the cluster is not yet stable, two informational messages are
    logged.
    """
    elapsed = utils.get_utc_now() - self.__last_cluster_mod_time
    # timedelta.seconds only holds the sub-day remainder, so on its own it
    # wraps back to 0 every 24 hours; include the days component to get
    # the real elapsed time.
    elapsed_secs = elapsed.days * 86400 + elapsed.seconds
    is_stabilized = elapsed_secs >= self.stabilization_time
    if not is_stabilized:
        log.info("Cluster was modified less than %d seconds ago" %
                 self.stabilization_time)
        log.info("Waiting for cluster to stabilize...")
    return is_stabilized
def __init__(self, interval=60, max_nodes=None, wait_time=900,
             add_pi=1, kill_after=45, stab=180, lookback_win=3,
             min_nodes=None, kill_cluster=False, plot_stats=False,
             plot_output_dir=None, dump_stats=False, stats_file=None):
    """
    Store the load balancer settings on the instance.

    interval - stored as polling_interval
    max_nodes - stored as max_nodes
    wait_time - stored as longest_allowed_queue_time
    add_pi - stored as add_nodes_per_iteration
    kill_after - stored as kill_after
    stab - stored as stabilization_time
    lookback_win - stored as lookback_window
    min_nodes - stored as min_nodes
    kill_cluster - stored as kill_cluster
    plot_stats - stored as plot_stats; when truthy, self.visualizer is
                 asserted to be not None
    plot_output_dir - stored as plot_output_dir
    dump_stats - stored as dump_stats
    stats_file - stored as stats_file
    """
    # internal state
    self._cluster = None
    self._stat = None
    self._visualizer = None
    self._keep_polling = True
    self.__last_cluster_mod_time = utils.get_utc_now()

    # cluster size limits
    self.min_nodes = min_nodes
    self.max_nodes = max_nodes

    # timing and scaling settings
    self.polling_interval = interval
    self.longest_allowed_queue_time = wait_time
    self.stabilization_time = stab
    self.lookback_window = lookback_win
    self.add_nodes_per_iteration = add_pi
    self.kill_after = kill_after
    self.kill_cluster = kill_cluster

    # stats output settings
    self.dump_stats = dump_stats
    self.stats_file = stats_file
    self.plot_stats = plot_stats
    self.plot_output_dir = plot_output_dir

    # must come last: self.visualizer is defined outside this block and
    # may depend on the attributes assigned above
    if plot_stats:
        assert self.visualizer is not None