def recover(self, name, vm_list, partial_failure):
    """
    Try to recover a failed node by migrating or restarting its VMs.

    name - (String) Name of the failed node.
    vm_list - (List of strings) Names of the VMs running on the failed node.
    partial_failure - (bool) True if contact with the failed node is not fully lost.

    Return True if the recovery is successful.
    Return False if the recovery cannot be done (i.e. the service is still alive).
    Raise an Exception if something goes wrong.
    """
    assert type(name) == str, "Param 'name' should be a string."
    assert type(vm_list) == list, "Param 'vm_list' should be a list."
    assert type(partial_failure) == bool, "Param 'partial_failure' should be a bool."

    log.info("Trying to recover", name, "...")
    try:
        # Try to get the VMs back on alive nodes.
        # If a VM is paused, the eject will fail.
        self.emergency_eject(self.get_node(name))
        log.info("VM from %s successfully migrated to healthy nodes." % (name))

        # Eject successful, fence the node
        try:
            log.info("Fencing useless node %s ..." % (name))
            self.get_local_node().fence(name)
        except Exception, e:
            # If fencing fails, it is not a big deal: the VMs are alive.
            log.err("Fencing of %s failed:" % (name), e)

        return True  # Succeeded!
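A minimal usage sketch of the contract described in the docstring above; the caller, the `cluster` object and the node name are hypothetical and not taken from the original code:

# Hypothetical caller (illustration only): 'cluster' stands for a XenCluster-like
# object exposing recover(), and 'log' for the project's logger.
def on_node_failure(cluster, name, vm_list, partial_failure):
    try:
        if cluster.recover(name, vm_list, partial_failure):
            log.info("Node %s recovered, its VMs are safe." % (name))
        else:
            # False means the node may still be alive (e.g. a network partition),
            # so nothing was fenced and the failover must be retried later.
            log.warn("Cannot recover %s yet." % (name))
    except Exception, e:
        log.err("Recovery of %s failed:" % (name), e)
        raise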
def checkSlaveHeartbeats(self):
    # Check the slaves' timestamps only if we are the active master
    if self.role != MasterService.RL_ACTIVE:
        return

    # No failover in panic mode
    if self.state == MasterService.ST_PANIC:
        return

    # No more failover if a recovery is running
    if self.state == MasterService.ST_RECOVERY:
        return

    # No failover if we are alone
    if len(self.status) <= 1:
        return

    # Check net heartbeat
    netFailed = Set()
    for name, values in self.status.items():
        if values['timestamp'] == 0:
            # Do nothing if the first heartbeat has not been received yet
            continue

        if values['timestamp'] + MasterService.TM_SLAVE <= int(time.time()):
            log.warn("Net heartbeat lost for %s." % (name))
            netFailed.add(name)

    # Get diskheartbeat timestamps
    try:
        tsDisk = self.disk.get_all_ts()
    except Exception, e:
        log.err("Diskheartbeat read failed: %s." % (e))
        raise
def joinCluster(self):

    def startHeartbeats():
        self._startSlave()
        self.s_rpc.startService()
        if self.role == MasterService.RL_ACTIVE:
            self._startMaster()

    def joinRefused(reason):
        reason.trap(NodeRefusedError, RPCRefusedError)
        log.err("Join to cluster %s failed: Master %s has refused me: %s"
                % (core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
        self.stopService()

    def joinAccepted(result):
        self.role = MasterService.RL_PASSIVE
        log.info("Join successful, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
        startHeartbeats()

    def masterConnected(obj):
        d = obj.callRemote("register", DNSCache.getInstance().name)
        d.addCallbacks(joinAccepted, joinRefused)
        d.addErrback(log.err)
        d.addBoth(lambda _: rpcConnector.disconnect())
        return d

    try:
        if self.master is None:
            # New active master
            if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
                log.warn("I'm not allowed to create a new cluster. Exiting.")
                raise Exception("Cluster creation not allowed")

            if DiskHeartbeat.is_in_use():
                log.err("Heartbeat disk is in use but we are alone!")
                raise Exception("Heartbeat disk already in use")

            log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
            self.role = MasterService.RL_ACTIVE
            self.master = DNSCache.getInstance().name
            self.status[self.master] = {'timestamp': 0, 'offset': 0, 'vms': []}
            self.disk.make_slot(DNSCache.getInstance().name)
            startHeartbeats()
        else:
            # Passive master
            self.role = MasterService.RL_JOINING
            log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

            factory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
            d = factory.getRootObject()
            d.addCallback(masterConnected)
            d.addErrback(log.err)
    except Exception, e:
        log.err("Startup failed: %s. Shutting down." % (e))
        self.stopService()
def recoverSucceeded(result, name):
    # result is the return code from XenCluster.recover()
    # If True: success, if False: maybe a partition
    if result:
        log.info("Successfully recovered node %s." % (name))
        self._unregister(name)
    else:
        log.err("Partial failure, cannot recover", name)
def dispatchMessage(self, data, host):
    dispatcher = {
        "slavehb": self.updateNodeStatus,
        "masterhb": self.updateMasterStatus,
        "voterequest": self.voteForNewMaster,
        "voteresponse": self.recordVote,
    }

    try:
        msg = MessageHelper.get(data, host)
        log.debugd("Received", msg)
        dispatcher[msg.type()](msg)
    except (MessageError, KeyError), e:
        log.err("Bad message from %s: %s, %s" % (host, data, e))
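The handler table above is plain dict-dispatch; a self-contained sketch of the same pattern (the message types and handlers here are invented) shows why an unknown type falls into the KeyError branch together with malformed payloads:

# Stand-alone illustration of dict-based dispatch; message types are made up.
def handle_ping(msg):
    print "ping from", msg.get('host')

def handle_status(msg):
    print "status:", msg.get('payload')

def dispatch(msg):
    handlers = {
        "ping": handle_ping,
        "status": handle_status,
    }
    try:
        handlers[msg['type']](msg)
    except KeyError:
        # An unknown or missing 'type' lands here, like the KeyError
        # branch of dispatchMessage() above.
        print "dropping unknown message:", msg

dispatch({'type': 'ping', 'host': 'node1'})   # handled
dispatch({'type': 'bogus'})                   # dropped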
def _sendError(self, reason):
    # Log the full stack trace to see the origin of this error
    log.err("Netheartbeat failure: %s" % (reason))

    if self.retry >= self.MAX_RETRY:
        log.emerg("Too many retries. Asking master to engage panic mode.")

        # Engage panic mode
        agent = Agent()
        d = agent.panic()
        d.addErrback(log.err)
        d.addBoth(lambda x: agent.disconnect())
    else:
        log.warn("Restarting network heartbeat within a few seconds...")
        self.retry += 1  # Will be reset at each election (or panic recovery)
        reactor.callLater(2, self._run, self._proto)
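The retry logic above caps the number of reschedules before escalating; here is a minimal, runnable Twisted sketch of the same idea, where the failing send and the panic step are stand-ins rather than the real Agent API:

# Stand-in retry loop: send_heartbeat() and engage_panic() are placeholders.
from twisted.internet import reactor

MAX_RETRY = 3

def send_heartbeat():
    raise IOError("network down")  # always fails, to exercise the retry path

def engage_panic():
    print "too many retries, escalating to panic mode"
    reactor.stop()

def try_send(retry=0):
    try:
        send_heartbeat()
    except Exception:
        if retry >= MAX_RETRY:
            engage_panic()
        else:
            # Same idea as _sendError(): try again a couple of seconds later.
            reactor.callLater(2, try_send, retry + 1)

reactor.callWhenRunning(try_send)
reactor.run()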
def startService(self):

    def standalone(reason):
        log.info("Starting in standalone mode.")
        self.agent = None

    def cluster(result):
        log.info("Starting in cluster mode.")
        Service.startService(self)

        msg = self.node.run("svn status " + core.cfg['VMCONF_DIR'] + " 2>&1").read()
        if len(msg) > 0:
            log.err("Your repo is not clean. Please check it: %s" % (msg))
            raise Exception("SVN repo not clean")

    d = self.agent.ping()
    d.addCallbacks(cluster, standalone)
    d.addBoth(lambda _: self.spawnInotify())
    d.addErrback(log.err)
    return d
def load_cfg():
    """Load the global configuration file into the cfg dict."""
    type_map = {
        str: "a string",
        bool: "a boolean",
        int: "an integer",
        list: "a list",
    }

    try:
        execfile("/etc/xen/cxm.conf", dict(), cfg)

        # Check the type of configuration entries
        for key in cfg_type.keys():
            if cfg[key]:
                assert type(cfg[key]) == cfg_type[key], "%s should be %s." % (key, type_map[cfg_type[key]])
    except Exception, e:
        log.err("Configuration file error:", e)
        sys.exit(e)
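For context, a plausible shape of the `cfg` and `cfg_type` globals that load_cfg() validates; the keys below are the ones used in the other snippets here (CLUSTER_NAME, TCP_PORT, ALLOWED_NODES, VMCONF_DIR), but the defaults are assumptions:

# Assumed defaults for illustration; the real values come from /etc/xen/cxm.conf.
cfg = {
    'CLUSTER_NAME': "",
    'TCP_PORT': 8989,
    'ALLOWED_NODES': [],
    'VMCONF_DIR': "/etc/xen/vm/",
}

# Expected type for each entry, checked by the assert in load_cfg().
cfg_type = {
    'CLUSTER_NAME': str,
    'TCP_PORT': int,
    'ALLOWED_NODES': list,
    'VMCONF_DIR': str,
}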
try: log.info("Fencing useless node %s ..." % (name)) self.get_local_node().fence(name) except Exception, e: # If fencing fail, this is not a big deal, VM are alive log.err("Fencing of %s failed:" % (name), e) return True # Succeeded ! except NotEnoughRamError: # Engage panic mode raise except NotInClusterError: # Next step of recovery process pass except Exception, e: log.err("Cannot get the VMs back:", e) if partial_failure: # Cannot recover, node still alive return False # Check if VM are still alive if len(vm_list)>0: for node in self.get_nodes(): if node.ping(vm_list): log.warn("Some VM on %s are still alive !" % (name)) return False log.warn("All VM on %s are dead. Fencing now !" % (name)) else: log.warn("No VM running on %s. Fencing now !" % (name))
def commitFailed(reason):
    log.err("SVN failed: %s" % reason.getErrorMessage())
def heartbeatFailed(reason):
    log.err("Disk heartbeat failure: %s." % (reason.getErrorMessage()))
    self.stopService()  # Stop slave heartbeat to tell master we have a problem
    return d

def _unregister(self, name):
    try:
        del self.status[name]
    except:
        pass

    try:
        self.disk.erase_slot(name)
    except DiskHeartbeatError, e:
        log.warn("Cannot erase slot: %s. You may have to reformat the heartbeat disk." % (e))
    except Exception, e:
        log.err("Diskheartbeat failure: %s." % (e))
        self.panic()

    DNSCache.getInstance().delete(name)
    log.info("Node %s has been unregistered." % (name))

def unregisterNode(self, name):
    # Can unregister a node even if in panic mode
    if self.role != MasterService.RL_ACTIVE:
        log.warn("I'm not master. Cannot unregister %s." % (name))
        raise RPCRefusedError("Not master")

    if name not in self.status:
        log.warn("Unknown node %s tried to quit the cluster." % (name))
        raise NodeRefusedError("Unknown node " + name)
#!/usr/bin/env python2
import sys, os
import vars_init
vars_init.init_no_state()
import paths
from logs import err

if len(sys.argv[1:]) != 1:
    err('%s: exactly one argument expected.\n' % sys.argv[0])
    sys.exit(1)

want = sys.argv[1]
if not want:
    err('cannot build the empty target ("").\n')
    sys.exit(204)

abswant = os.path.abspath(want)
for dodir, dofile, basedir, basename, ext in paths.possible_do_files(abswant):
    dopath = os.path.join('/', dodir, dofile)
    relpath = os.path.relpath(dopath, '.')
    exists = os.path.exists(dopath)
    assert '\n' not in relpath
    print relpath
    if exists:
        sys.exit(0)

sys.exit(1)  # no appropriate dofile found
# Suppress the system's error output
os.close(sys.stderr.fileno())

if __name__ == '__main__':
    try:
        # Print the copyright banner. Before commenting this line out, please join
        # the group or bookmark my blog, thank you (〃'▽'〃)
        copyright.main()

        # Check that everything is ready
        try:
            p = pyaudio.PyAudio()
            print('\033[1;32m ################## Sound card driver loaded! ################### \033[0m')
        except:
            err = logs.err()
            print('\033[1;31m ############ Failed to load the sound card driver! Please check it. ############# \033[0m')
            exit()

        try:
            # Normally a plain init() is enough, but playback sounded odd on my
            # sound card, so the frequency is tuned here to work around it.
            pygame.mixer.init(frequency=15500, size=-16, channels=4)
            print('\033[1;32m ################## Playback initialized! ################### \033[0m')
        except:
            err = logs.err()
            print('\033[1;31m ############ Failed to initialize playback! Please check the sound card driver. ############# \033[0m')
    else:
        f = me = None
        debug2('redo-ifchange: not adding depends.\n')

    jwack.setup(1)
    try:
        targets = sys.argv[1:]
        if f:
            for t in targets:
                f.add_dep('m', t)
            f.save()
            state.commit()
        rv = builder.main(targets, should_build)
    finally:
        try:
            state.rollback()
        finally:
            try:
                jwack.force_return_tokens()
            except Exception, e:
                traceback.print_exc(100, sys.stderr)
                err('unexpected error: %r\n' % e)
                rv = 1
except KeyboardInterrupt:
    if vars_init.is_toplevel:
        builder.await_log_reader()
    sys.exit(200)

state.commit()
if vars_init.is_toplevel:
    builder.await_log_reader()
sys.exit(rv)
#!/usr/bin/env python2
import sys, os
import state
from logs import err

if len(sys.argv[1:]) < 2:
    err('%s: at least 2 arguments expected.\n' % sys.argv[0])
    sys.exit(1)

target = sys.argv[1]
deps = sys.argv[2:]

for d in deps:
    assert d != target

me = state.File(name=target)

# Build the known dependencies of our primary target.  This *does* require
# grabbing locks.
os.environ['REDO_NO_OOB'] = '1'
argv = ['redo-ifchange'] + deps
rv = os.spawnvp(os.P_WAIT, argv[0], argv)
if rv:
    sys.exit(rv)

# We know our caller already owns the lock on target, so we don't have to
# acquire another one; tell redo-ifchange about that.  Also, REDO_NO_OOB
# persists from up above, because we don't want to do OOB now either.
# (Actually it's most important for the primary target, since it's the one
# who initiated the OOB in the first place.)
os.environ['REDO_UNLOCKED'] = '1'
try: log.info("Fencing useless node %s ..." % (name)) self.get_local_node().fence(name) except Exception, e: # If fencing fail, this is not a big deal, VM are alive log.err("Fencing of %s failed:" % (name), e) return True # Succeeded ! except NotEnoughRamError: # Engage panic mode raise except NotInClusterError: # Next step of recovery process pass except Exception, e: log.err("Cannot get the VMs back:", e) if partial_failure: # Cannot recover, node still alive return False # Check if VM are still alive if len(vm_list) > 0: for node in self.get_nodes(): if node.ping(vm_list): log.warn("Some VM on %s are still alive !" % (name)) return False log.warn("All VM on %s are dead. Fencing now !" % (name)) else: log.warn("No VM running on %s. Fencing now !" % (name))
#!/usr/bin/env python2
import sys, os
import vars_init
vars_init.init([])
import state, vars
from logs import err

if len(sys.argv[1:]) != 0:
    err('%s: no arguments expected.\n' % sys.argv[0])
    sys.exit(1)

cwd = os.getcwd()
for f in state.files():
    if f.is_generated and f.read_stamp() != state.STAMP_MISSING:
        print state.relpath(os.path.join(vars.BASE, f.name), cwd)
#!/usr/bin/env python2
import sys, os
import vars, state
from logs import err, debug2

if len(sys.argv) > 1:
    err('%s: no arguments expected.\n' % sys.argv[0])
    sys.exit(1)

if os.isatty(0):
    err('%s: you must provide the data to stamp on stdin\n' % sys.argv[0])
    sys.exit(1)

# hashlib is only available in python 2.5 or higher, but the 'sha' module
# produces a DeprecationWarning in python 2.6 or higher.  We want to support
# python 2.4 and above without any stupid warnings, so let's try using hashlib
# first, and downgrade if it fails.
try:
    import hashlib
except ImportError:
    import sha
    sh = sha.sha()
else:
    sh = hashlib.sha1()

while 1:
    b = os.read(0, 4096)
    sh.update(b)
    if not b:
        break

csum = sh.hexdigest()
try:
    if vars_init.is_toplevel:
        builder.start_stdin_log_reader(status=opt.status, details=opt.details,
                                       pretty=opt.pretty, color=opt.color,
                                       debug_locks=opt.debug_locks,
                                       debug_pids=opt.debug_pids)
    for t in targets:
        if os.path.exists(t):
            f = state.File(name=t)
            if not f.is_generated:
                warn('%s: exists and not marked as generated; not redoing.\n'
                     % f.nicename())
    state.rollback()

    j = atoi(opt.jobs or 1)
    if j < 1 or j > 1000:
        err('invalid --jobs value: %r\n' % opt.jobs)
    jwack.setup(j)
    try:
        assert state.is_flushed()
        retcode = builder.main(targets, lambda t: (True, True))
        assert state.is_flushed()
    finally:
        try:
            state.rollback()
        finally:
            try:
                jwack.force_return_tokens()
            except Exception, e:
                traceback.print_exc(100, sys.stderr)
                err('unexpected error: %r\n' % e)
                retcode = 1