def _hook():
    pbscc.set_application_name("cycle_autoscale")

    # allow local overrides of jetpack.config or allow non-jetpack masters to define the complete set of settings.
    overrides = {}
    if os.path.exists(pbscc.CONFIG_PATH):
        try:
            pbscc.warn("overrides exist in file %s" % pbscc.CONFIG_PATH)
            with open(pbscc.CONFIG_PATH) as fr:
                overrides = json.load(fr)
        except Exception:
            pbscc.error(traceback.format_exc())
            sys.exit(1)
    else:
        pbscc.debug("No overrides exist in file %s" % pbscc.CONFIG_PATH)

    cc_config = cyclecloud.config.new_provider_config(overrides=overrides)

    if len(sys.argv) < 3:
        # There are no env variables for this as far as I can tell.
        bin_dir = "/opt/pbs/bin"
    else:
        bin_dir = sys.argv[2]
        if not os.path.isdir(bin_dir):
            bin_dir = os.path.dirname(bin_dir)

    clusters_api = clustersapi.ClustersAPI(cc_config.get("cyclecloud.cluster.name"), cc_config)
    autostart = PBSAutostart(pbs_driver.PBSDriver(bin_dir), clusters_api, cc_config=cc_config)
    autostart.autoscale()
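
# Illustrative only: a sketch of what an overrides file at pbscc.CONFIG_PATH might contain.
# The nested {"pbspro": {...}} shape is implied by the max_unmatched_jobs warning in
# autoscale() below; how the cyclecloud.* settings are nested depends on
# cyclecloud.config.new_provider_config, so the layout and values here are assumptions,
# not a reference:
#
#   {
#       "pbspro": {
#           "max_unmatched_jobs": 10000,
#           "remove_down_nodes": 300
#       }
#   }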
def try_shutdown_pbsnode():
    '''
    Queue this pbsnode's instance for shutdown. Down nodes are given a grace
    period (pbspro.remove_down_nodes, default 300 seconds) to recover before
    being removed. Returns True if the instance was queued for shutdown.
    '''
    if not instance_id:
        pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
    elif "down" in states:
        # don't immediately remove down nodes, give them time to recover from network failure.
        remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
        since_down = self.clock.time() - pbsnode["last_state_change_time"]
        if since_down > remove_down_nodes:
            pbscc.error("Removing down node %s after %.0f seconds", hostname, since_down)
            instance_ids_to_shutdown[instance_id] = hostname
            return True
        else:
            omega = remove_down_nodes - since_down
            pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, omega)
    else:
        instance_ids_to_shutdown[instance_id] = hostname
        return True

    return False
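
# Worked example (assuming the default pbspro.remove_down_nodes of 300 seconds): a node
# whose state changed to "down" 120 seconds ago is kept for another 180 seconds and the
# function returns False; once since_down exceeds 300 seconds it is queued for shutdown
# and the function returns True. Nodes that are not down are queued immediately.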
def autoscale(self):
    '''
    The main loop described at the top of this class.

    Returns machine_requests, idle_machines and total_machines for ease of unit testing.
    '''
    pbscc.info("Begin autoscale cycle")

    nodearray_definitions = self.fetch_nodearray_definitions()
    pbsnodes_by_hostname, existing_machines, booting_instance_ids, instance_ids_to_shutdown = self.get_existing_machines(nodearray_definitions)

    start_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.start_enabled", "true")).lower()
    if not start_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.start_enabled is false, new machines will not be allocated.")

    autoscaler = autoscalerlib.Autoscaler(nodearray_definitions, existing_machines, self.default_placement_attrs, start_enabled)

    # throttle how many jobs we attempt to match. When pbspro.compress_jobs is true (default) this shouldn't really be an issue
    # unless the user has over $pbspro.max_unmatched_jobs unique sets of requirements.
    max_unmatched_jobs = int(self.cc_config.get("pbspro.max_unmatched_jobs", 10000))

    unmatched_jobs = 0
    for job in self.query_jobs():
        if job.executing_hostname:
            try:
                autoscaler.get_machine(hostname=job.executing_hostname).add_job(job, force=True)
                continue
            except RuntimeError as e:
                pbscc.error(str(e))

        if not autoscaler.add_job(job):
            unmatched_jobs += 1
            pbscc.info("Can not match job %s." % job.name)
            if max_unmatched_jobs > 0 and unmatched_jobs >= max_unmatched_jobs:
                pbscc.warn('Maximum number of unmatched jobs reached - %s. To configure this setting, change {"pbspro": {"max_unmatched_jobs": N}} in %s' % (unmatched_jobs, pbscc.CONFIG_PATH))
                break

    machine_requests = autoscaler.get_new_machine_requests()
    idle_machines = autoscaler.get_idle_machines()

    autoscale_request = autoscale_util.create_autoscale_request(machine_requests)

    for request_set in autoscale_request["sets"]:
        configuration = request_set["nodeAttributes"]["Configuration"]
        if "pbspro" not in configuration:
            configuration["pbspro"] = {}

        configuration["pbspro"]["slot_type"] = request_set["nodearray"]

        if not request_set.get("placementGroupId"):
            configuration["pbspro"]["is_grouped"] = False
        else:
            configuration["pbspro"]["is_grouped"] = True

    autoscale_util.scale_up(self.clusters_api, autoscale_request)

    for r in machine_requests:
        if r.placeby_value:
            pbscc.info("Requesting %d %s machines in placement group %s for nodearray %s" % (r.instancecount, r.machinetype, r.placeby_value, r.nodearray))
        else:
            pbscc.info("Requesting %d %s machines in nodearray %s" % (r.instancecount, r.machinetype, r.nodearray))

    if pbscc.is_fine():
        pbscc.fine("New target state of the cluster, including booting nodes:")
        for m in autoscaler.machines:
            pbscc.fine(" %s" % str(m))

    if instance_ids_to_shutdown:
        pbscc.info("Shutting down instance ids %s" % instance_ids_to_shutdown.keys())
        self.clusters_api.shutdown(instance_ids_to_shutdown.keys())

        for hostname in instance_ids_to_shutdown.values():
            pbscc.info("Deleting %s" % hostname)
            self.driver.delete_host(hostname)

    now = self.clock.time()

    stop_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.stop_enabled", "true")).lower()
    if not stop_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.stop_enabled is false, idle machines will not be terminated")

    if stop_enabled:
        idle_before_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_before_jobs", 3600))
        idle_after_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_after_jobs", 300))

        for m in idle_machines:
            if m.get_attr("instance_id", "") not in booting_instance_ids:
                pbscc.debug("Could not find instance id in CycleCloud %s" % m.get_attr("instance_id", ""))
                continue

            pbsnode = pbsnodes_by_hostname.get(m.hostname)

            # the machine may not have converged yet, so only consider nodes that have registered with PBS.
            if pbsnode:
                if "busy" in pbsnode["state"]:
                    if "down" in pbsnode["state"]:
                        pbscc.warn("WARNING: %s is down but busy with jobs %s", m.hostname, pbsnode.get("jobs", []))
                    else:
                        pbscc.error("WARNING: Falsely determined that %s is idle!" % m.hostname)
                        continue

                last_state_change_time = pbsnode["last_state_change_time"]
                last_used_time = pbsnode.get("last_used_time")
                if last_used_time:
                    # last_used_time can be stale while a job is exiting, e.g. last_state_change_time could be < 5 minutes but
                    # somehow last_used_time > 5 minutes, causing us to prematurely terminate the node just because a job took a long time
                    # to exit.
                    last_used_time = max(last_state_change_time, last_used_time)
                else:
                    last_used_time = self.clock.time()

                if now - last_used_time > idle_after_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_used_time))
                    self.driver.set_offline(m.hostname)
                elif now - last_state_change_time > idle_before_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_state_change_time))
                    self.driver.set_offline(m.hostname)

    pbscc.info("End autoscale cycle")

    # returned for testing purposes
    return machine_requests, idle_machines, autoscaler.machines
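
# Worked example of the idle logic above (assuming the default thresholds of
# idle_time_after_jobs=300 and idle_time_before_jobs=3600 seconds): an idle node that
# last ran a job 400 seconds ago, with no state change since, exceeds the 300-second
# post-job threshold and is set offline; an idle node that never reports a
# last_used_time is only set offline once 3600 seconds have passed since its last
# state change.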