def run(self, cluster):
    """
    Poll the cluster and try to rebalance it until polling is switched off.

    Every pass refreshes the queue statistics with get_stats(), logs a short
    summary, lets _eval_add_node() and _eval_remove_node() decide whether the
    cluster should change size, calls the visualizer and then sleeps for
    ``polling_interval`` seconds. Average job duration is only logged here;
    it is not yet used by this method to make decisions.

    Raises exception.ClusterNotRunning if the cluster is not up on entry.
    Returns None, either when ``_keep_polling`` becomes false or as soon as
    get_stats() returns -1.
    """
    self._cluster = cluster
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)

    while self._keep_polling:
        # Take no action while the cluster is only partially up; just wait
        # one polling interval and check again.
        if not cluster.is_cluster_up():
            not_up_msg = ("Entire cluster is not up, nodes added/removed. "
                          "No Action.")
            log.info(not_up_msg)
            time.sleep(self.polling_interval)
            continue

        # get_stats() signals failure with -1; give up entirely in that case.
        stats_status = self.get_stats()
        if stats_status == -1:
            log.error("Failed to get stats. LoadBalancer is terminating.")
            return

        # Summarise the current queue state.
        oldest_age = self.stat.oldest_queued_job_age()
        queued_count = len(self.stat.get_queued_jobs())
        host_count = len(self.stat.hosts)
        log.info(
            "Oldest job is from %s. # queued jobs = %d. # hosts = %d."
            % (oldest_age, queued_count, host_count))

        avg_duration = self.stat.avg_job_duration()
        avg_wait = self.stat.avg_wait_time()
        log.info("Avg job duration = %d sec, Avg wait time = %d sec."
                 % (avg_duration, avg_wait))

        # Growth is evaluated before shrinkage, then the visualizer runs.
        self._eval_add_node()
        self._eval_remove_node()
        self._call_visualizer()

        # Wait before the next pass.
        log.info("Sleeping, looping again in %d seconds.\n"
                 % self.polling_interval)
        time.sleep(self.polling_interval)
def run(self, cluster):
    """
    Run the load-balancing loop against ``cluster`` until polling stops.

    Setup (performed once):
      * ``max_nodes`` defaults to ``cluster.cluster_size`` and ``min_nodes``
        to 1 when they are None; ``kill_cluster`` forces ``min_nodes`` to 0.
      * ``stats_file`` and ``plot_output_dir`` fall back to
        DEFAULT_STATS_FILE / DEFAULT_STATS_DIR (formatted with the cluster
        tag) when unset; the default directory is created if either
        default is actually going to be used.
      * The stats file / plot directory destinations are validated when
        ``dump_stats`` / ``plot_stats`` are enabled.

    Loop (while ``_keep_polling`` is true): call ``cluster.recover()`` and
    ``cluster.clean()``, wait if the cluster is not fully up, refresh the
    statistics with get_stats(), log a summary, evaluate adding and removing
    nodes, optionally write the stats CSV and plots, optionally terminate
    the cluster, then sleep for ``polling_interval`` seconds unless
    _eval_add_node() returned a true value.

    Raises:
        exception.BaseException: if ``min_nodes`` exceeds ``max_nodes``, if
            the stats file destination is a directory, if the plot output
            destination is a file, or if plotting fails with an IOError.
        exception.ClusterNotRunning: if the cluster is not up on entry.

    Returns:
        The result of ``cluster.terminate_cluster()`` when ``kill_cluster``
        is set and _eval_terminate_cluster() says to terminate; otherwise
        None once ``_keep_polling`` becomes false.
    """
    self._cluster = cluster

    # --- resolve and sanity-check the cluster size limits -----------------
    if self.max_nodes is None:
        self.max_nodes = cluster.cluster_size
    if self.min_nodes is None:
        self.min_nodes = 1
    if self.kill_cluster:
        # kill_cluster overrides whatever minimum was configured
        self.min_nodes = 0
    if self.min_nodes > self.max_nodes:
        raise exception.BaseException(
            "min_nodes cannot be greater than max_nodes")

    # --- resolve output locations -----------------------------------------
    # Only create the default stats directory when a default location is
    # really going to be written to.
    use_default_stats_file = self.dump_stats and not self.stats_file
    use_default_plots_dir = self.plot_stats and not self.plot_output_dir
    if use_default_stats_file or use_default_plots_dir:
        self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
    # The paths themselves are always filled in: the stats CSV is written
    # when either dump_stats or plot_stats is enabled (see the loop below).
    if not self.stats_file:
        self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
    if not self.plot_output_dir:
        self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag

    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)

    # --- validate output destinations -------------------------------------
    if self.dump_stats:
        if os.path.isdir(self.stats_file):
            raise exception.BaseException("stats file destination '%s' is"
                                          " a directory" % self.stats_file)
        sfdir = os.path.dirname(os.path.abspath(self.stats_file))
        self._validate_dir(sfdir, msg_prefix="stats file destination")
    if self.plot_stats:
        if os.path.isfile(self.plot_output_dir):
            raise exception.BaseException("plot output destination '%s' "
                                          "is a file" %
                                          self.plot_output_dir)
        self._validate_dir(self.plot_output_dir,
                           msg_prefix="plot output destination")

    # --- startup banner ---------------------------------------------------
    # Passed as ``extra`` to the logger; NOTE(review): presumably tells the
    # project's log formatter to emit the message unformatted -- confirm.
    raw = dict(__raw__=True)
    log.info("Starting load balancer (Use ctrl-c to exit)")
    log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
    log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
    log.info("Cluster growth rate: %d nodes/iteration\n" %
             self.add_nodes_per_iteration, extra=raw)
    if self.dump_stats:
        log.info("Writing stats to file: %s" % self.stats_file)
    if self.plot_stats:
        log.info("Plotting stats to directory: %s" % self.plot_output_dir)

    # --- main polling loop ------------------------------------------------
    while self._keep_polling:
        # NOTE(review): recover()/clean() are opaque from here; presumably
        # they repair unresponsive nodes and tidy cluster state -- confirm.
        cluster.recover(reboot_interval=self.reboot_interval,
                        n_reboot_restart=self.n_reboot_restart)
        cluster.clean()
        if not cluster.is_cluster_up():
            log.info("Waiting for all nodes to come up...")
            time.sleep(self.polling_interval)
            continue

        # The return value of get_stats() is not inspected here.
        self.get_stats()
        log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
        log.info("Execution slots: %d" % self.stat.count_total_slots(),
                 extra=raw)
        log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                 extra=raw)
        oldest_queued_job_age = self.stat.oldest_queued_job_age()
        if oldest_queued_job_age:
            log.info("Oldest queued job: %s" % oldest_queued_job_age,
                     extra=raw)
        log.info("Avg job duration: %d secs" %
                 self.stat.avg_job_duration(), extra=raw)
        log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                 extra=raw)
        log.info("Last cluster modification time: %s" %
                 self.__last_cluster_mod_time.isoformat(),
                 extra=raw)

        # evaluate if nodes need to be added; a true return value skips the
        # sleep at the end of this iteration
        skip_sleep = self._eval_add_node()
        # evaluate if nodes need to be removed
        self._eval_remove_node()

        # The CSV is written for plotting as well as for dumping.
        if self.dump_stats or self.plot_stats:
            self.stat.write_stats_to_csv(self.stats_file)
        # call the visualizer
        if self.plot_stats:
            try:
                self.visualizer.graph_all()
            except IOError as e:
                # Re-raise as the project's own exception type.
                raise exception.BaseException(str(e))

        # evaluate if cluster should be terminated
        if self.kill_cluster:
            if self._eval_terminate_cluster():
                log.info("Terminating cluster and exiting...")
                return self._cluster.terminate_cluster()

        if not skip_sleep:
            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)