예제 #1
0
 def _eval_add_node(self):
     """
     Inspect the current state of the SGE queue and, if warranted, add
     nodes to the cluster.

     Nodes are requested when the cluster is below self.min_nodes, when
     there are no slots at all, or when queued jobs need more slots than
     are free and the oldest queued job has waited longer than
     self.longest_allowed_queue_time seconds. The request is capped by
     self.add_nodes_per_iteration and by self.max_nodes minus the number
     of running nodes.

     Returns None in every case. A failure raised by
     self._cluster.add_nodes is logged rather than propagated.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     # With zero slots there is nothing to wait for, so the stabilization
     # check only blocks when the cluster already has slots.
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum(int(j['slots']) for j in running_jobs)
     qw_slots = sum(int(j['slots']) for j in queued_jobs)
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         # timedelta.seconds holds only the sub-day remainder, so fold the
         # days back in; otherwise a job queued for over a day looks new.
         waited = age_delta.days * 86400 + age_delta.seconds
         if waited > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (waited, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 # Whole hosts needed for the queued slots, rounded up so
                 # a demand smaller than one host still gets a node, and
                 # kept an int regardless of the division semantics.
                 need_to_add = int(-(-qw_slots // slots_per_host))
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
예제 #2
0
 def _eval_remove_node(self):
     """
     Consult the SGE stats and remove nodes from the cluster when allowed.

     Nothing is removed while jobs are queued, while the cluster has not
     stabilized, or when the node count is at or below self.min_nodes.
     Candidates come from self._find_nodes_for_removal; a failure while
     removing one node is logged and the remaining candidates are still
     processed. Returns None.
     """
     if len(self.stat.get_queued_jobs()) > 0:
         return
     if not self.has_cluster_stabilized():
         return
     node_count = len(self._cluster.nodes)
     if self.min_nodes >= node_count:
         log.info("Not removing nodes: already at or below minimum (%d)"
                  % self.min_nodes)
         return
     # never shrink the cluster below the configured minimum
     headroom = node_count - self.min_nodes
     log.info("Looking for nodes to remove...")
     candidates = self._find_nodes_for_removal(max_remove=headroom)
     if not candidates:
         log.info("No nodes can be removed at this time")
     for candidate in candidates:
         if candidate.update() == "running":
             log.warn("Removing %s: %s (%s)" %
                      (candidate.alias, candidate.id, candidate.dns_name))
             try:
                 self._cluster.remove_node(candidate)
                 self.__last_cluster_mod_time = utils.get_utc_now()
             except Exception:
                 log.error("Failed to remove node %s" % candidate.alias,
                           exc_info=True)
         else:
             log.error("Node %s is already dead - not removing" %
                       candidate.alias)
예제 #3
0
 def get_all_stats(self):
     """
     Collect a snapshot of the current stats as a flat list.

     The returned list holds, in order: the current time from
     utils.get_utc_now(), the host count, the number of running jobs,
     the number of queued jobs, the total slot count, the average job
     duration, the average job wait time and the average host load
     (0.0 when self.get_loads() returns nothing).
     """
     # reduce is a builtin only on Python 2; functools provides it on
     # both Python 2.6+ and Python 3.
     from functools import reduce

     now = utils.get_utc_now()
     bits = []
     # first field is the time
     bits.append(now)
     # second field is the number of hosts
     bits.append(self.count_hosts())
     # third field is # of running jobs
     bits.append(len(self.get_running_jobs()))
     # fourth field is # of queued jobs
     bits.append(len(self.get_queued_jobs()))
     # fifth field is total # slots
     bits.append(self.count_total_slots())
     # sixth field is average job duration
     bits.append(self.avg_job_duration())
     # seventh field is average job wait time
     bits.append(self.avg_wait_time())
     # last field is the average of the per-host loads
     arr = self.get_loads()
     # arr may be empty if there are no exec hosts
     if arr:
         load_sum = float(reduce(self._add, arr))
         avg_load = load_sum / len(arr)
     else:
         avg_load = 0.0
     bits.append(avg_load)
     return bits
예제 #4
0
 def execute(self, args):
     """
     Validate the command line and request the spot price history.

     args must contain exactly one instance type that is a key of
     static.INSTANCE_TYPES. Options -c/-v and -d/-s are mutually
     exclusive. When days_ago is given, the start time is derived by
     subtracting that many days from the end time (or from the current
     UTC time when no end time was supplied). Problems are reported
     through self.parser.error.
     """
     opts = self.opts
     type_listing = ', '.join(sorted(static.INSTANCE_TYPES.keys()))
     if len(args) != 1:
         self.parser.error('please provide an instance type (options: %s)' %
                           type_listing)
     if self.opts.classic and self.opts.vpc:
         self.parser.error("options -c and -v cannot be specified at "
                           "the same time")
     requested_type = args[0]
     if requested_type not in static.INSTANCE_TYPES:
         self.parser.error('invalid instance type. possible options: %s' %
                           type_listing)
     history_start = opts.start_time
     history_end = opts.end_time
     if opts.days_ago:
         if opts.start_time:
             self.parser.error("options -d and -s cannot be specified at "
                               "the same time")
         # anchor the look-back window on the requested end, else "now"
         anchor = (utils.iso_to_datetime_tuple(opts.end_time)
                   if opts.end_time else utils.get_utc_now())
         look_back = timedelta(days=opts.days_ago)
         history_start = utils.datetime_tuple_to_iso(anchor - look_back)
     web_browser = self.cfg.globals.get("web_browser")
     self.ec2.get_spot_history(requested_type, history_start, history_end,
                               zone=opts.zone,
                               plot=opts.plot,
                               plot_web_browser=web_browser,
                               vpc=opts.vpc,
                               classic=opts.classic)
예제 #5
0
 def __init__(self, remote_tzinfo=None):
     """
     Start with empty host, job and queue collections and a job stats
     cache pre-filled with None.

     remote_tzinfo is stored as self.remote_tzinfo; when it is falsy the
     tzinfo of utils.get_utc_now() is stored instead.
     """
     cache_size = 200
     self.jobstat_cachesize = cache_size
     self.hosts, self.jobs = [], []
     self.queues = {}
     # one placeholder per cache entry
     self.jobstats = [None] * cache_size
     self.max_job_id = 0
     if not remote_tzinfo:
         remote_tzinfo = utils.get_utc_now().tzinfo
     self.remote_tzinfo = remote_tzinfo
예제 #6
0
 def has_cluster_stabilized(self):
     """
     Report whether enough time has passed since the cluster was last
     modified.

     Returns True when at least self.stabilization_time seconds have
     elapsed since the recorded last-modification time, and False
     (after logging why) otherwise.
     """
     now = utils.get_utc_now()
     since_mod = now - self.__last_cluster_mod_time
     # timedelta.seconds is only the sub-day remainder; include the days
     # so a cluster untouched for over a day is not treated as freshly
     # modified.
     elapsed = since_mod.days * 86400 + since_mod.seconds
     is_stabilized = not (elapsed < self.stabilization_time)
     if not is_stabilized:
         log.info("Cluster was modified less than %d seconds ago" %
                  self.stabilization_time)
         log.info("Waiting for cluster to stabilize...")
     return is_stabilized
예제 #7
0
 def __init__(self, interval=60, max_nodes=None, wait_time=900,
              add_pi=1, kill_after=45, stab=180, lookback_win=3,
              min_nodes=None, kill_cluster=False, plot_stats=False,
              plot_output_dir=None, dump_stats=False, stats_file=None):
     """
     Store the balancer's options on the instance.

     Renamed on storage: interval -> polling_interval, wait_time ->
     longest_allowed_queue_time, add_pi -> add_nodes_per_iteration,
     stab -> stabilization_time, lookback_win -> lookback_window. The
     remaining arguments are stored under their own names. The
     last-modification timestamp starts at the current UTC time. When
     plot_stats is true, self.visualizer is accessed and asserted to be
     non-None.
     """
     # internal state, filled in later
     self._cluster = self._visualizer = self._stat = None
     self._keep_polling = True
     self.__last_cluster_mod_time = utils.get_utc_now()

     # cluster size limits
     self.min_nodes = min_nodes
     self.max_nodes = max_nodes

     # timing and pacing options
     self.polling_interval = interval
     self.longest_allowed_queue_time = wait_time
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.kill_cluster = kill_cluster

     # stats output options
     self.stats_file = stats_file
     self.dump_stats = dump_stats
     self.plot_output_dir = plot_output_dir
     self.plot_stats = plot_stats
     if plot_stats:
         assert self.visualizer is not None