def _first_heartbeat(self, sender, beat):
    node_id = beat.get('node_id')
    if not node_id:
        log.error("EE heartbeat from %s without a node_id!: %s", sender, beat)
        return

    node = self.store.get_node(node_id)
    if node is None:
        log.warn("EE heartbeat from unknown node. Still booting? " +
                 "node_id=%s sender=%s.", node_id, sender)

        # TODO I'm thinking the best thing to do here is query EPUM
        # for the state of this node in case the initial node_state
        # update got lost. Note that we shouldn't go ahead and
        # schedule processes onto this EE until we get the RUNNING
        # node_state update -- there could be a failure later on in
        # the contextualization process that triggers the node to be
        # terminated.
        return

    if node.properties:
        properties = node.properties.copy()
    else:
        properties = {}

    log.info("First heartbeat from EEAgent %s on node %s (%s)",
             sender, node_id, properties.get("hostname", "unknown hostname"))

    try:
        engine_id = engine_id_from_domain(node.domain_id)
    except ValueError:
        log.exception("Node for EEagent %s has invalid domain_id!", sender)
        return

    engine_spec = self.get_engine(engine_id)
    slots = engine_spec.slots

    # just making engine type a generic property/constraint for now,
    # until it is clear something more formal is needed.
    properties['engine'] = engine_id

    try:
        self.node_add_resource(node, sender)
    except NotFoundError:
        log.warn("Node removed while processing heartbeat. ignoring. "
                 "node_id=%s sender=%s.", node_id, sender)
        return

    timestamp_str = beat['timestamp']
    timestamp = ceiling_datetime(parse_datetime(timestamp_str))

    resource = ResourceRecord.new(sender, node_id, slots, properties)
    resource.new_last_heartbeat_datetime(timestamp)
    try:
        self.store.add_resource(resource)
    except WriteConflictError:
        # no problem if this resource was just created by another worker
        log.info("Conflict writing new resource record %s. Ignoring.", sender)
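
# The helpers engine_id_from_domain / domain_id_from_engine used above and in
# the test below are not shown in this section. A minimal sketch of the pair,
# assuming domain IDs simply embed the engine ID behind a fixed prefix (the
# "pd_domain_" prefix here is an illustrative assumption, not necessarily the
# real scheme):

_DOMAIN_PREFIX = "pd_domain_"  # assumed prefix, for illustration only


def domain_id_from_engine(engine_id):
    """Build a domain ID that encodes the given engine ID."""
    return _DOMAIN_PREFIX + engine_id


def engine_id_from_domain(domain_id):
    """Inverse of domain_id_from_engine.

    Raises ValueError on malformed input, which _first_heartbeat above
    catches and logs.
    """
    if not (domain_id and domain_id.startswith(_DOMAIN_PREFIX)):
        raise ValueError("unexpected domain_id: %r" % (domain_id,))
    engine_id = domain_id[len(_DOMAIN_PREFIX):]
    if not engine_id:
        raise ValueError("domain_id %r has empty engine id" % (domain_id,))
    return engine_id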
def test_heartbeat_timestamps(self):
    # test that a resource's last_heartbeat_datetime tracks the newest
    # heartbeat and ignores out-of-order (stale) timestamps
    node_id = uuid.uuid4().hex
    self.core.node_state(node_id, domain_id_from_engine("engine1"),
                         InstanceState.RUNNING)
    d1 = parse_datetime("2013-04-02T19:37:57.617734+00:00")
    d2 = parse_datetime("2013-04-02T19:38:57.617734+00:00")
    d3 = parse_datetime("2013-04-02T19:39:57.617734+00:00")

    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d1.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d1)

    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d3.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d3)

    # out of order heartbeat. time shouldn't be updated
    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d2.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d3)
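
# make_beat is a test helper that is not shown in this section. A plausible
# sketch, assuming it builds a heartbeat dict with the fields ee_heartbeat
# requires (node_id, timestamp, processes); names and defaults here are
# illustrative:

import datetime


def make_beat(node_id, processes=None, timestamp=None):
    """Build a minimal EEAgent heartbeat message for tests."""
    return {
        'node_id': node_id,
        'timestamp': timestamp or datetime.datetime.utcnow().isoformat(),
        'processes': processes or [],
    }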
def ee_heartbeat(self, sender, beat):
    """Incoming heartbeat from an EEAgent

    @param sender: ION name of sender
    @param beat: information about running processes
    @return:

    When an EEAgent starts, it immediately begins sending heartbeats to
    the PD. The first received heartbeat will trigger the PD to mark the
    EE as available in its slot tables, and potentially start deploying
    some WAITING process requests.

    The heartbeat message will consist of at least these fields
    (an example payload is sketched after this method):
        - node id - unique ID for the provisioned resource (VM) the EE runs on
        - timestamp - time heartbeat was generated
        - processes - list of running process IDs
    """
    # sender can be in the format $sysname.$eename when CFG.dashi.sysname
    # is set, or it will be just $eename, if there is no sysname set.
    # We need to make sure that we remove the sysname when it is enabled to
    # get the correct eeagent name.
    if '.' in sender:
        sender = sender.split('.')[-1]

    resource = self.store.get_resource(sender)
    if resource is None:
        # first heartbeat from this EE
        self._first_heartbeat(sender, beat)
        return  # *** EARLY RETURN ***

    resource_updated = False

    timestamp_str = beat['timestamp']
    timestamp = ceiling_datetime(parse_datetime(timestamp_str))

    resource_timestamp = resource.last_heartbeat_datetime
    if resource_timestamp is None or timestamp > resource_timestamp:
        resource.new_last_heartbeat_datetime(timestamp)
        resource_updated = True

    assigned_procs = set()
    processes = beat['processes']
    node_exclusives_to_remove = []
    for procstate in processes:
        upid = procstate['upid']
        round = int(procstate['round'])
        state = procstate['state']

        # TODO hack to handle how states are formatted in EEAgent heartbeat
        if isinstance(state, (list, tuple)):
            state = "-".join(str(s) for s in state)

        # TODO owner?
        process = self.store.get_process(None, upid)
        if not process:
            log.warn("EE reports process %s that is unknown!", upid)

            if state < ProcessState.TERMINATED:
                assigned_procs.add((None, upid, round))
            else:
                self.eeagent_client.cleanup_process(sender, upid, round)
            continue

        if round < process.round:
            # skip heartbeat info for processes that are already redeploying
            # but send a cleanup request first
            self.eeagent_client.cleanup_process(sender, upid, round)
            continue

        if state == process.state:
            # if we managed to update the process record already for a
            # terminated process but didn't update the resource record,
            # clean up the process
            if state >= ProcessState.TERMINATED:
                self.eeagent_client.cleanup_process(sender, upid, round)
            continue

        if process.state == ProcessState.PENDING and \
                state == ProcessState.RUNNING:

            assigned_procs.add(process.key)

            # mark as running and notify subscriber
            process, changed = self.process_change_state(
                process, ProcessState.RUNNING)

        elif state in (ProcessState.TERMINATED, ProcessState.FAILED,
                       ProcessState.EXITED):

            # process has died in resource. Obvious culprit is that it was
            # killed on request.

            if process.node_exclusive:
                node_exclusives_to_remove.append(process.node_exclusive)

            if process.state == ProcessState.TERMINATING:
                # mark as terminated and notify subscriber
                process, updated = self.process_change_state(
                    process, ProcessState.TERMINATED, assigned=None)

            # otherwise it may need to be rescheduled
            elif process.state in (ProcessState.PENDING,
                                   ProcessState.RUNNING):
                if self.process_should_restart(process, state):
                    self.process_next_round(process)
                else:
                    self.process_change_state(process, state, assigned=None)

            # send cleanup request to EEAgent now that we have dealt
            # with the dead process
            self.eeagent_client.cleanup_process(sender, upid, round)

    new_assigned = []
    for owner, upid, round in resource.assigned:
        key = (owner, upid, round)
        process = self.store.get_process(owner, upid)
        if key in assigned_procs:
            new_assigned.append(key)

        # prune process assignments once the process has terminated or
        # moved onto the next round
        elif (process and process.round == round and
                process.state < ProcessState.TERMINATED):
            new_assigned.append(key)

    if len(new_assigned) != len(resource.assigned):
        # first update node exclusive tags
        if node_exclusives_to_remove:
            node = self.store.get_node(resource.node_id)
            if node:
                self.node_remove_exclusive_tags(node, node_exclusives_to_remove)
            else:
                log.warning("Node %s not found while attempting to update "
                            "node_exclusive", resource.node_id)

        if log.isEnabledFor(logging.DEBUG):
            old_assigned_set = set(tuple(item) for item in resource.assigned)
            new_assigned_set = set(tuple(item) for item in new_assigned)

            difference_message = get_set_difference_debug_message(
                old_assigned_set, new_assigned_set)
            log.debug("updating resource %s assignments: %s",
                      resource.resource_id, difference_message)

        resource.assigned = new_assigned
        resource_updated = True

    if resource_updated:
        try:
            self.store.update_resource(resource)
        except (WriteConflictError, NotFoundError):
            # TODO? right now this will just wait for the next heartbeat
            pass
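
# For reference, a heartbeat payload carrying process state looks roughly
# like the dict below, based on the fields ee_heartbeat reads from each
# process entry (upid, round, state); the concrete values are illustrative
# only, not captured from a real agent:

example_beat = {
    'node_id': 'a2f38fabbb6a4bc19e0626d3e09e7b1a',
    'timestamp': '2013-04-02T19:37:57.617734+00:00',
    'processes': [
        # state may also arrive as a list/tuple such as [500, 'RUNNING'],
        # which the handler above joins into a "500-RUNNING" style string
        # before comparing against ProcessState values
        {'upid': 'proc1', 'round': 0, 'state': '500-RUNNING'},
    ],
}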