def fetch_nodearray_definitions(self):
    '''
        A wrapper around the autoscale library function to parse Configuration.autoscale.* chef attributes and add
        the 'ungrouped' attribute to the machine types.

        The definitions are fetched with machine.fetch_nodearray_definitions(self.clusters_api,
        self.default_placement_attrs) and placement_group_optional is set to True on the result.
        Every attribute of every machine type is then passed through pbscc.parse_gb_size; values
        that raise InvalidSizeExpressionError are left unchanged.

        Returns the (mutated) nodearray definitions object.

        See cyclecloud.nodearrays.NodearrayDefinitions for more info.
    '''
    nodearray_definitions = machine.fetch_nodearray_definitions(
        self.clusters_api, self.default_placement_attrs)
    nodearray_definitions.placement_group_optional = True

    for machinetype in nodearray_definitions:
        # ensure that any custom attribute the user specified, like disk = 100G, gets parsed correctly
        # Only the values of keys that already exist are reassigned here, so the
        # set of keys does not change while iterating.
        for key, value in machinetype.iteritems():
            try:
                machinetype[key] = pbscc.parse_gb_size(key, value)
            except InvalidSizeExpressionError:
                # not a size expression; keep the original value
                pass

        # kludge: there is a strange bug where ungrouped is showing up as a string and not a boolean.
        # 'ungrouped' is therefore written as the strings "true" / "false" rather than as bools.
        if not machinetype.get("group_id"):
            machinetype["ungrouped"] = "true"
        else:
            machinetype["ungrouped"] = "false"
            # NOTE(review): this replaces the existing group_id of a grouped machine type
            # with a generated "ungrouped-" id. Confirm that this is intended and that the
            # assignment belongs in this branch.
            machinetype["group_id"] = str(
                autoscale_util.uuid("ungrouped-"))

    return nodearray_definitions
def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
    '''
        Turn a single pbsnode record into a cyclecloud.machine.Machine instance, or
        schedule it for shutdown.

        pbsnode - mapping with at least "state" (comma separated string) and
                  "resources_available"; "jobs" and "last_state_change_time" are
                  read when relevant.
        instance_ids_to_shutdown - OUT parameter. When the node can be shut down,
                  instance_id -> hostname is recorded here.
        nodearray_definitions - used to look up the machinetype of the node.

        Returns None when the node was queued for shutdown or is in any kind of
        down state, otherwise the new machine instance.
    '''
    states = set(pbsnode["state"].split(","))
    resources = pbsnode["resources_available"]
    # host has incorrect case
    hostname = resources["vnode"]
    # the fallback id is always generated, even when the node reports an instance_id
    instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))

    def try_shutdown_pbsnode():
        # Records the node in instance_ids_to_shutdown and returns True if it may
        # be shut down right now, otherwise returns False.
        if not instance_id:
            pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
            return False

        if "down" not in states:
            instance_ids_to_shutdown[instance_id] = hostname
            return True

        # don't immediately remove down nodes, give them time to recover from network failure.
        grace_period = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
        down_for = self.clock.time() - pbsnode["last_state_change_time"]
        if down_for > grace_period:
            pbscc.error("Removing down node %s after %.0f seconds", hostname, down_for)
            instance_ids_to_shutdown[instance_id] = hostname
            return True

        remaining = grace_period - down_for
        pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, remaining)
        return False

    if "offline" in states:
        if pbsnode.get("jobs", []):
            pbscc.fine("Host %s is offline but still running jobs" % hostname)
        else:
            pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
            if try_shutdown_pbsnode():
                return

    # if the node is just in the down state, try to shut it down.
    if states == set(["down"]) and try_shutdown_pbsnode():
        return

    # just ignore complex down nodes (down,job-busy etc) until PBS decides to change the state.
    if "down" in states:
        return

    # convert relevant resources from bytes to floating point (GB)
    for key in resources:
        current = resources[key]

        if isinstance(current, basestring) and current.lower() in ["true", "false"]:
            # boolean-looking strings are left in resources exactly as they are
            # (the parsed bool was never written back in the original either).
            continue

        if isinstance(current, list):
            # TODO will need to support this eventually
            continue

        try:
            resources[key] = pbscc.parse_gb_size(key, current)
        except InvalidSizeExpressionError:
            # not a size expression, keep the value untouched
            pass

    resources["hostname"] = hostname

    nodearray_name = resources.get("nodearray") or resources.get("slot_type")
    group_id = resources.get("group_id")
    machinetype_name = resources.get("machinetype")

    if machinetype_name and nodearray_name:
        machinetype = nodearray_definitions.get_machinetype(nodearray_name, machinetype_name, group_id)
    else:
        # rely solely on resources_available
        pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
        machinetype = {"availableCount": 1, "name": "undefined"}

    return machine.new_machine_instance(machinetype, **pbsnode["resources_available"])