def get_existing_machines(self, nodearray_definitions):
    '''
    Queries pbsnodes and CycleCloud to get a sane set of cyclecloud.machine.Machine instances that represent
    the current state of the cluster.

    Every pbsnode is passed through self.process_pbsnode; nodes for which it returns nothing are skipped.
    CycleCloud nodes that were not matched to any pbsnode by instance id are then added as machines built
    from their nodearray ("Template") and "MachineType".

    Returns a 4-tuple:
        pbsnodes                 - the mapping returned by self.driver.pbsnodes().get(None)
        existing_machines        - list of machine instances (from pbsnodes first, then unmatched CycleCloud nodes)
        nodes_by_instance_id     - Record of instance id -> pbsnode (matched) or CycleCloud node (unmatched)
        instance_ids_to_shutdown - Record filled in by self.process_pbsnode (OUT parameter there)

    Raises ValueError if the machinetype of an unmatched CycleCloud node can not be found in
    nodearray_definitions (i.e. get_machinetype raised KeyError).
    '''
    pbsnodes = self.driver.pbsnodes().get(None)
    existing_machines = []

    booting_instance_ids = autoscale_util.nodes_by_instance_id(
        self.clusters_api, nodearray_definitions)

    instance_ids_to_shutdown = Record()
    nodes_by_instance_id = Record()

    for pbsnode in pbsnodes.values():
        inst = self.process_pbsnode(pbsnode, instance_ids_to_shutdown, nodearray_definitions)
        if not inst:
            continue
        existing_machines.append(inst)

        # we found the pbsnode that matches the cyclecloud node, so let's remove the duplicate
        instance_id = inst.get_attr("instance_id", "")
        if instance_id in booting_instance_ids:
            booting_instance_ids.pop(instance_id)
        nodes_by_instance_id[instance_id] = pbsnode

    # Whatever is left in booting_instance_ids has no pbsnode counterpart.
    # items() (instead of the Python 2 only iteritems()) works on both Python 2 and 3; the list() copy
    # keeps the iteration independent of the underlying mapping.
    for instance_id, node in list(booting_instance_ids.items()):
        nodearray_name = node["Template"]
        machinetype_name = node["MachineType"]

        try:
            machinetype = nodearray_definitions.get_machinetype(
                nodearray_name, machinetype_name, node.get("PlacementGroupId"))
        except KeyError as e:
            raise ValueError(
                "machine is %s, key is %s, rest is %s" % (nodearray_name, str(e), nodearray_definitions))

        # NOTE(review): group_id is read from "placementGroupId" while the lookup above reads
        # "PlacementGroupId" - confirm the node record is case-insensitive.
        inst = machine.new_machine_instance(
            machinetype,
            hostname=node.get("hostname"),
            instance_id=node.get("InstanceId"),
            group_id=node.get("placementGroupId"))

        existing_machines.append(inst)
        nodes_by_instance_id[instance_id] = node

    return pbsnodes, existing_machines, nodes_by_instance_id, instance_ids_to_shutdown
def get_existing_machines(self, nodearray_definitions):
    '''
    Build the current view of the cluster as cyclecloud.machine.Machine instances by combining
    the output of pbsnodes with the nodes known to CycleCloud.

    Returns the tuple (pbsnodes, existing_machines, nodes_by_instance_id, instance_ids_to_shutdown);
    instance_ids_to_shutdown is populated by self.process_pbsnode.

    Raises ValueError when the machinetype of a CycleCloud node without a matching pbsnode
    can not be looked up in nodearray_definitions.
    '''
    pbsnodes = self.driver.pbsnodes().get(None)
    machines = []

    # CycleCloud nodes keyed by instance id; entries are removed as pbsnodes claim them.
    unmatched_cc_nodes = autoscale_util.nodes_by_instance_id(self.clusters_api, nodearray_definitions)

    to_shutdown = Record()
    by_instance_id = Record()

    for pbsnode in pbsnodes.values():
        available = pbsnode["resources_available"]
        reported_id = available.get("instance_id", "")

        # use this opportunity to set some things that can change during the runtime (keepalive)
        # or are not always set by previous versions (machinetype/nodearray)
        cc_node = unmatched_cc_nodes.get(reported_id) if reported_id else None
        if cc_node:
            available["keep_alive"] = cc_node.get("KeepAlive", False)
            available["machinetype"] = available.get("machinetype") or cc_node.get("MachineType")
            available["nodearray"] = available.get("nodearray") or cc_node.get("Template")

        converted = self.process_pbsnode(pbsnode, to_shutdown, nodearray_definitions)
        if not converted:
            continue
        machines.append(converted)

        # this pbsnode accounts for the CycleCloud node with the same id, so drop the duplicate
        matched_id = converted.get_attr("instance_id", "")
        if matched_id in unmatched_cc_nodes:
            unmatched_cc_nodes.pop(matched_id)
        by_instance_id[matched_id] = pbsnode

    # Anything still unmatched exists in CycleCloud only.
    for cc_instance_id, cc_node in list(unmatched_cc_nodes.items()):
        template = cc_node["Template"]
        machinetype_name = cc_node["MachineType"]

        try:
            machinetype = nodearray_definitions.get_machinetype(
                template, machinetype_name, cc_node.get("PlacementGroupId"))
        except KeyError as e:
            raise ValueError("machine is %s, key is %s, rest is %s" % (template, str(e), nodearray_definitions))

        instance_attrs = {
            "hostname": cc_node.get("hostname"),
            "instance_id": cc_node.get("InstanceId"),
            "group_id": cc_node.get("placementGroupId"),
            "keep_alive": cc_node.get("KeepAlive", False),
        }
        machines.append(machine.new_machine_instance(machinetype, **instance_attrs))
        by_instance_id[cc_instance_id] = cc_node

    return pbsnodes, machines, by_instance_id, to_shutdown
def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
    '''
    If the pbsnode is offline, will handle evaluating whether the node can be shutdown. See
    instance_ids_to_shutdown, which is an OUT parameter here.

    Otherwise convert the pbsnode into a cyclecloud.machine.Machine instance.

    pbsnode                  - mapping with at least "state" (comma separated string) and
                               "resources_available" (mapping containing "vnode"); "jobs" and
                               "last_state_change_time" are read when relevant.
    instance_ids_to_shutdown - mapping of instance id -> hostname, filled in for nodes to shut down.
    nodearray_definitions    - used to look up the machinetype when both machinetype and
                               nodearray (or slot_type) are defined on the node.

    Returns None when the node was scheduled for shutdown or is in any down state, otherwise
    the machine instance. Note that pbsnode["resources_available"] is modified in place
    (sizes and booleans are parsed, "hostname" is added).
    '''
    states = set(pbsnode["state"].split(","))
    resources = pbsnode["resources_available"]
    # host has incorrect case
    hostname = resources["vnode"]

    instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))

    def try_shutdown_pbsnode():
        # Returns True only when the node was recorded in instance_ids_to_shutdown.
        if not instance_id:
            pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
        elif "down" in states:
            # don't immediately remove down nodes, give them time to recover from network failure.
            remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
            since_down = self.clock.time() - pbsnode["last_state_change_time"]
            if since_down > remove_down_nodes:
                pbscc.error("Removing down node %s after %.0f seconds", hostname, since_down)
                instance_ids_to_shutdown[instance_id] = hostname
                return True
            else:
                omega = remove_down_nodes - since_down
                pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, omega)
        else:
            instance_ids_to_shutdown[instance_id] = hostname
            return True

        return False

    if "offline" in states:
        if not pbsnode.get("jobs", []):
            pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
            if try_shutdown_pbsnode():
                return
        else:
            pbscc.fine("Host %s is offline but still running jobs" % hostname)

    # if the node is just in the down state, try to shut it down.
    if set(["down"]) == states and try_shutdown_pbsnode():
        return

    # just ignore complex down nodes (down,job-busy etc) until PBS decides to change the state.
    if "down" in states:
        return

    # convert relevant resources from bytes to floating point (GB)
    for key in resources:
        value = resources[key]
        if isinstance(value, basestring) and value.lower() in ["true", "false"]:
            # Store the parsed boolean back; assigning only to the local would discard it.
            resources[key] = value.lower() == "true"
        elif isinstance(value, list):
            # TODO will need to support this eventually
            continue
        else:
            try:
                resources[key] = pbscc.parse_gb_size(key, resources[key])
            except InvalidSizeExpressionError:
                # not a size expression, keep the value untouched
                pass

    resources["hostname"] = hostname

    nodearray_name = resources.get("nodearray") or resources.get("slot_type")
    group_id = resources.get("group_id")

    if resources.get("machinetype") and nodearray_name:
        machinetype = nodearray_definitions.get_machinetype(nodearray_name, resources.get("machinetype"), group_id)
    else:
        # rely solely on resources_available
        pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
        machinetype = {"availableCount": 1, "name": "undefined"}

    inst = machine.new_machine_instance(machinetype, **pbsnode["resources_available"])

    return inst