def ev_close(self, worker):
    """
    Translate the fsck exit status into a component event and a status.

    A timed-out worker is delegated to FSAction.ev_close(). Otherwise exit
    codes 0, 1, 2 and 4 are reported as a 'done' event (with a message when
    errors were corrected or left uncorrected) and any other exit code is
    reported as a 'failed' event carrying the collected output.
    """
    if worker.did_timeout():
        return FSAction.ev_close(self, worker)

    # FSAction.ev_close() is skipped on purpose: call the upper layer only.
    Action.ev_close(self, worker)
    self.comp.lustre_check()

    # fsck returns 0=NOERROR, 1=OK_BUT_CORRECTION, 2=OK_BUT_REBOOT and,
    # when run with -n, 4 for errors that were found but left as is.
    # see man fsck.
    retcode = worker.retcode()

    if retcode not in (0, 1, 2, 4):
        # action failed
        output = "\n".join(self._output)
        result = ErrorResult(output, self.duration, retcode)
        self.comp.action_event(self, 'failed', result)
        self.set_status(ACT_ERROR)
        return

    # action succeeded
    result = Result(duration=self.duration, retcode=retcode)
    if retcode in (1, 2):
        result.message = "Errors corrected"
    elif retcode == 4:
        result.message = "Errors found but NOT corrected"
    self.comp.action_event(self, 'done', result)
    self.set_status(ACT_OK)
def ev_read(self, worker):
    """
    Decode one message received from a node and forward it as an event.

    The current worker message is unpacked with shine_msg_unpack() and
    passed to self.fs.distant_event(). Unpickling errors are not raised:
    they are stored per node in self._errpickle.
    """
    node = worker.current_node
    buf = worker.current_msg
    try:
        data = shine_msg_unpack(buf)

        # COMPAT: Prior to 1.4, 'comp'+'action' was used.
        # 1.4+ uses ActionInfo
        if 'comp' in data:
            action = Action()
            action.NAME = data.pop('action')
            comp = data.pop('comp')
            comp.fs = self.fs
            desc = "%s of %s" % (action.NAME, comp.longtext())
            data['info'] = ActionInfo(action, comp, desc)
            evtype = 'comp'
        else:
            evtype = data.pop('evtype')

        self.fs.distant_event(evtype, node=node, **data)
    except ProxyActionUnpickleError as exp:
        # Maintain a standalone list of unpickling errors.
        # Node could have unpickling error but still exit with 0
        msg = str(exp)
        # Skip texts already contained in what is recorded for this node.
        if msg not in self._errpickle.get(node, ""):
            self._errpickle.add(node, msg)
def ev_close(self, worker):
    """
    Inspect how the worker ended and publish the matching server event.

    Emits 'timeout', 'done' or 'failed' through self.server.action_event()
    and sets the action status to ACT_OK only for a zero return code.
    """
    Action.ev_close(self, worker)
    self.server.lustre_check()

    # Timeout: no result object is attached to the event.
    if worker.did_timeout():
        self.server.action_event(self, 'timeout')
        self.set_status(ACT_ERROR)
        return

    # Success: zero return code.
    if worker.retcode() == 0:
        result = Result(duration=self.duration, retcode=worker.retcode())
        self.server.action_event(self, 'done', result)
        self.set_status(ACT_OK)
        return

    # Failure: the worker output becomes the error message.
    output = worker.read()
    result = ErrorResult(output, self.duration, worker.retcode())
    self.server.action_event(self, 'failed', result)
    self.set_status(ACT_ERROR)
def ev_close(self, worker):
    """
    Inspect how the worker ended, report errors and set the action status.

    Timed-out nodes and nodes with a non-zero return code are reported
    through self.fs._handle_shine_proxy_error(); the status is ACT_OK only
    when the highest return code is 0.
    """
    Action.ev_close(self, worker)

    # Timeout: report the nodes that did not answer in time.
    if worker.did_timeout():
        late_nodes = NodeSet.fromlist(worker.iter_keys_timeout())
        self.fs._handle_shine_proxy_error(late_nodes, "Nodes timed out")
        self.set_status(ACT_ERROR)
        return

    # Success: every return code is 0.
    worst_rc = max(rc for rc, _ in worker.iter_retcodes())
    if worst_rc == 0:
        self.set_status(ACT_OK)
        return

    # Failure: report the output of each group of failing nodes.
    for rc, rc_nodes in worker.iter_retcodes():
        if rc != 0:
            # Avoid warnings, flag this component in error state
            for comp in self._comps or []:
                comp.sanitize_state(nodes=worker.nodes)

            for output, keys in worker.iter_buffers(match_keys=rc_nodes):
                failed_nodes = NodeSet.fromlist(keys)
                msg = "Copy failed: %s" % output
                self.fs._handle_shine_proxy_error(failed_nodes, msg)

    self.set_status(ACT_ERROR)
def ev_read(self, worker):
    """
    Decode one message received from a node and forward it as an event.

    Messages that unpack correctly go to self.fs.distant_event().
    Unpickling and attribute errors are stored per node in
    self._errpickle; output that is not a shine message is stored in
    self._outputs.
    """
    node = worker.current_node
    buf = worker.current_msg

    def record_error(text):
        # Skip texts already contained in what is recorded for this node.
        if text not in self._errpickle.get(node, ""):
            self._errpickle.add(node, text)

    try:
        data = shine_msg_unpack(buf)

        # COMPAT: Prior to 1.4, 'comp'+'action' was used.
        # 1.4+ uses ActionInfo
        if 'comp' not in data:
            evtype = data.pop('evtype')
        else:
            action = Action()
            action.NAME = data.pop('action')
            comp = data.pop('comp')
            comp.fs = self.fs
            desc = "%s of %s" % (action.NAME, comp.longtext())
            data['info'] = ActionInfo(action, comp, desc)
            evtype = 'comp'

        self.fs.distant_event(evtype, node=node, **data)
    except ProxyActionUnpickleError as exp:
        # Maintain a standalone list of unpickling errors.
        # Node could have unpickling error but still exit with 0
        record_error(str(exp))
    except AttributeError as exp:
        record_error("Cannot read message (check Shine and ClusterShell "
                     "version): %s" % str(exp))
    except ProxyActionUnpackError:
        # Store output that is not a shine message
        self._outputs.add(node, buf)
def ev_start(self, worker):
    """
    Announce on standard output which configuration file is updated.

    The message shows the base name of self.config_file. Above 8 nodes
    only the node count is printed, otherwise self.nodes itself.
    """
    Action.ev_start(self, worker)

    name = os.path.basename(self.config_file)
    # Parenthesized single-argument print: same output with the Python 2
    # print statement and with the print() function.
    if len(self.nodes) > 8:
        print("Updating configuration file `%s' on %d server(s)" %
              (name, len(self.nodes)))
    else:
        print("Updating configuration file `%s' on %s" % (name, self.nodes))
def ev_close(self, worker):
    """
    End of proxy command: report remote failures and set the final status.

    The shine command may have hit a bug, a node crash, etc., so every
    node return code is checked and the bad nodes are reported through
    self.fs._handle_shine_proxy_error().
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        self.set_status(ACT_ERROR)
        return

    # Remove the 'proxy' running action for each component.
    for comp in self._comps or []:
        # This special event helps to keep track of undergoing actions
        # (see ev_start())
        comp.action_event(self, 'done')
        comp.sanitize_state(nodes=worker.nodes)

    final_status = ACT_OK

    # Gather nodes by return code.
    # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
    # some common remote errors:
    # rc 127 = command not found
    # rc 126 = found but not executable
    # rc 1 = python failure...
    for rc, rc_nodes in worker.iter_retcodes():
        if rc == 0:
            continue

        # If there is at least one error, the action is on error.
        final_status = ACT_ERROR

        # Gather these nodes by buffer
        in_failed_group = rc_nodes.__contains__
        for buffers, keys in self._outputs.walk(match=in_failed_group):
            # Handle proxy command error
            targets = NodeSet.fromlist(keys)
            msg = "Remote action %s failed: %s\n" % (self.action, buffers)
            self.fs._handle_shine_proxy_error(targets, msg)

    # Raise errors for each unpickling error,
    # which could happen mostly when Shine exits with 0.
    for buffers, keys in self._errpickle.walk():
        targets = NodeSet.fromlist(keys)
        self.fs._handle_shine_proxy_error(targets, str(buffers))

    # Raise an error for nodes without output
    if len(self._silentnodes) > 0:
        msg = "Remote action %s failed: No response" % self.action
        self.fs._handle_shine_proxy_error(self._silentnodes, msg)

    self.set_status(final_status)
def ev_close(self, worker):
    """
    Inspect how the worker ended and set the action status accordingly.

    The status is ACT_OK only when the worker did not time out and the
    highest node return code is 0; otherwise it is ACT_ERROR.
    """
    Action.ev_close(self, worker)

    if worker.did_timeout():
        # Action timed out
        outcome = ACT_ERROR
    elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
        # Action succeeded
        outcome = ACT_OK
    else:
        # Action failed
        outcome = ACT_ERROR

    self.set_status(outcome)
def __init__(self, nodes, fs, config_file):
    """
    Initialize the base Action and keep the arguments as attributes.

    nodes, fs and config_file are stored unchanged as self.nodes,
    self.fs and self.config_file.
    """
    Action.__init__(self)
    self.nodes, self.fs, self.config_file = nodes, fs, config_file
def ev_close(self, worker):
    """
    End of proxy command: fix component states and report remote failures.

    The shine command may have hit a bug, a node crash, etc., so every
    node return code is checked and the bad nodes are reported through
    self.fs._handle_shine_proxy_error(). Components left without a state,
    or still INPROGRESS, are switched to RUNTIME_ERROR.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        self.set_status(ACT_ERROR)
        return

    status = ACT_OK

    # Remove the 'proxy' running action for each component.
    if self._comps:
        for comp in self._comps:
            # XXX: This should be changed using a real event for proxy.
            comp._del_action('proxy')

            if comp.state is None:
                comp.state = RUNTIME_ERROR

            # At this step, there should be no more INPROGRESS component.
            # If yes, this is a bug, change state to RUNTIME_ERROR.
            # INPROGRESS management could be change using running action
            # list.
            # Starting with v1.3, there is no more code setting INPROGRESS.
            # This is for compatibility with older clients.
            elif comp.state == INPROGRESS:
                actions = ""
                pending = comp._list_action()
                if len(pending):
                    actions = "actions: " + ", ".join(pending)
                # Written with sys.stderr.write() rather than the
                # Python 2-only "print >>" form; the output is the same.
                sys.stderr.write("ERROR: bad state for %s: %d %s\n" %
                                 (comp.label, comp.state, actions))
                comp.state = RUNTIME_ERROR

    # Gather nodes by return code
    for rc, nodes in worker.iter_retcodes():
        # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
        # some common remote errors:
        # rc 127 = command not found
        # rc 126 = found but not executable
        # rc 1 = python failure...
        if rc != 0:

            # If there is at least one error, the action is on error.
            status = ACT_ERROR

            # Gather these nodes by buffer. The bound method is taken
            # before the inner loop variable is assigned.
            key = nodes.__contains__
            for buffers, buf_nodes in self._outputs.walk(match=key):
                # Handle proxy command error
                buf_nodes = NodeSet.fromlist(buf_nodes)
                msg = "Remote action %s failed: %s\n" % \
                      (self.action, buffers)
                self.fs._handle_shine_proxy_error(buf_nodes, msg)

    # Raise errors for each unpickling error,
    # which could happen mostly when Shine exits with 0.
    for buffers, nodes in self._errpickle.walk():
        nodes = NodeSet.fromlist(nodes)
        self.fs._handle_shine_proxy_error(nodes, str(buffers))

    # Raise an error for nodes without output
    if len(self._silentnodes) > 0:
        msg = "Remote action %s failed: No response" % self.action
        self.fs._handle_shine_proxy_error(self._silentnodes, msg)

    self.set_status(status)