def drain(self):
    """Close this instance to new actors and drain all hosted actors.

    If any hosted Actor reports as not drainable, nothing is changed and
    False is returned. Otherwise the instance is marked closed, every
    actor is drained in parallel, a full stop of all actors is scheduled
    2 seconds later, and True is returned.
    """
    if not self.isDrainable():
        self.log("Cannot drain, some Actors are not drainable.")
        return False

    # Stop accepting new actors before telling the current ones to drain.
    self.isOpen = False
    self.log("Draining...")

    # Drain every actor in parallel; only completion matters here, the
    # per-actor results are not used.
    parallelExec(self._drainActor, self.actors.values())
    self.log("Drained.")

    # Give in-flight work a moment to settle, then stop everything.
    gevent.spawn_later(2, _stopAllActors)
    return True
def _svc_instance_draining(self):
    """Background service: periodically drains actor host instances.

    Wakes every 60 seconds until stopEvent is set and does two passes:
      1. Drains the instance of any actor whose configured
         'time_to_drain' lifetime has elapsed (if that instance is
         drainable).
      2. If global memory usage has reached the high watermark, drains
         the oldest drainable instance to reclaim memory.
    """
    while not self.stopEvent.wait(60 * 1):
        now = int(time.time())
        # First we evaluate actors with a time_to_drain.
        # (uid is not needed here, only the actor info itself.)
        for info in self.actorInfo.values():
            if info['time_to_drain'] is None:
                continue
            # This makes the assumption that the actor is running isolated, and if not the
            # entire actor host instance will be drained anyway.
            if now > info['start'] + info['time_to_drain']:
                if self._isDrainable(info['instance']):
                    self._log(
                        'Actor %s has reached time_to_drain, draining.' % info['name'])
                    self._doDrainInstance(info['instance'])
                    self.isInstanceChanged.set()
                else:
                    self._log(
                        'Actor %s has reached time_to_drain, but instance marked undrainable.' % info['name'])

        # Then we look at the general draining case.
        currentMemory = psutil.virtual_memory()

        # We start looking at draining once usage reaches the configured
        # high memory watermark (a percentage, 80 by default).
        if currentMemory.percent < self.highMemWatermark:
            #self._log( "Memory usage at %s percent, nothing to do." % currentMemory.percent )
            continue

        self._log("High memory watermark reached, trying to drain some instances.")

        # Ask every instance in parallel whether it is drainable; drainable
        # instances come back as their info dict, anything else is filtered out.
        drainable = [x for x in parallelExec(self._isDrainable, self.processes[:]) if type(x) is dict]
        self._log("Found %d instances available for draining." % len(drainable))

        # Pick the longest-running instance (with a live process) as the candidate.
        oldest = None
        for instance in drainable:
            if instance['p'] is not None:
                if oldest is None or oldest['start'] > instance['start']:
                    oldest = instance

        # Drain the oldest if we have one.
        if oldest is not None:
            self._log('Trying to drain %s' % oldest['id'])
            # Remove all actors in that instance from the directory before draining.
            self._doDrainInstance(oldest)
            self.isInstanceChanged.set()
def _svc_receiveOpsTasks( self ):
    """Control-plane dispatch loop: serves ops requests on the ops socket.

    Runs until stopEvent is set. Each iteration receives one request
    (a dict with a 'req' action field) from a child of the ops socket
    and replies on the same socket with a success or error message.
    Several copies of this service run concurrently (spawned in a loop
    by the constructor), all sharing the same underlying socket.
    """
    z = self.opsSocket.getChild()
    while not self.stopEvent.wait( 0 ):
        data = z.recv()
        if data is not False and 'req' in data:
            action = data[ 'req' ]
            #start = time.time()
            #self._log( "Received new ops request: %s" % action )
            if 'keepalive' == action:
                # Ack first, then learn about the sender and any other
                # nodes it mentions that we are not yet connected to.
                z.send( successMessage() )
                if 'from' in data and data[ 'from' ] not in self.nodes:
                    self._log( "Discovered new node: %s" % data[ 'from' ] )
                    self._connectToNode( data[ 'from' ] )
                for other in data.get( 'others', [] ):
                    if other not in self.nodes:
                        self._log( "Discovered new node: %s" % other )
                        self._connectToNode( other )
            elif 'start_actor' == action:
                # Spin up a new actor on one of our instances and register
                # it in the directory under its categories.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                elif 'actor_name' not in data or 'cat' not in data:
                    z.send( errorMessage( 'missing information to start actor' ) )
                else:
                    actorName = data[ 'actor_name' ]
                    categories = data[ 'cat' ]
                    realm = data.get( 'realm', 'global' )
                    parameters = data.get( 'parameters', {} )
                    resources = data.get( 'resources', {} )
                    ident = data.get( 'ident', None )
                    trusted = data.get( 'trusted', [] )
                    n_concurrent = data.get( 'n_concurrent', 1 )
                    is_drainable = data.get( 'is_drainable', False )
                    time_to_drain = data.get( 'time_to_drain', None )
                    owner = data.get( 'owner', None )
                    isIsolated = data.get( 'isolated', False )
                    log_level = data.get( 'loglevel', None )
                    log_dest = data.get( 'logdest', None )
                    uid = str( uuid.uuid4() )
                    port = self._getAvailablePortForUid( uid )
                    instance = self._getInstanceForActor( isIsolated )
                    if instance is not None:
                        # Record metadata before asking the instance to load
                        # the actor, then forward the start request to it.
                        self._setActorMtd( uid, instance, actorName, realm, isIsolated, owner, parameters, resources, time_to_drain )
                        newMsg = instance[ 'socket' ].request( { 'req' : 'start_actor',
                                                                 'actor_name' : actorName,
                                                                 'realm' : realm,
                                                                 'uid' : uid,
                                                                 'ip' : self.ifaceIp4,
                                                                 'port' : port,
                                                                 'parameters' : parameters,
                                                                 'resources' : resources,
                                                                 'ident' : ident,
                                                                 'trusted' : trusted,
                                                                 'n_concurrent' : n_concurrent,
                                                                 'is_drainable' : is_drainable,
                                                                 'isolated' : isIsolated,
                                                                 'loglevel' : log_level,
                                                                 'logdest' : log_dest }, timeout = 30 )
                    else:
                        newMsg = False
                    if isMessageSuccess( newMsg ):
                        self._log( "New actor loaded (isolation = %s, concurrent = %d), adding to directory" % ( isIsolated, n_concurrent ) )
                        # We always add a hardcoded special category _ACTORS/actorUid to provide a way for certain special actors
                        # to talk to specific instances directly, but this is discouraged.
                        with self.dirLock.writer():
                            self.reverseDir[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                            self.directory.setdefault( realm, PrefixDict() ).setdefault( '_ACTORS/%s' % ( uid, ), {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                            self.nonOptDir.setdefault( realm, {} ).setdefault( '_ACTORS/%s' % ( uid, ), {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                            for category in categories:
                                self.directory.setdefault( realm, PrefixDict() ).setdefault( category, {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                                self.nonOptDir.setdefault( realm, {} ).setdefault( category, {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                        self.isActorChanged.set()
                    else:
                        # Start failed: roll back directory state and tombstone the uid.
                        self._logCritical( 'Error loading actor %s: %s.' % ( actorName, newMsg ) )
                        self._removeUidFromDirectory( uid )
                        self._addTombstone( uid )
                        self._removeInstanceIfIsolated( instance )
                    z.send( newMsg )
            elif 'kill_actor' == action:
                # Stop one or more actors by uid; failures are collected
                # and reported together at the end.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                elif 'uid' not in data:
                    z.send( errorMessage( 'missing information to stop actor' ) )
                else:
                    uids = data[ 'uid' ]
                    if not isinstance( uids, ( tuple, list ) ):
                        uids = ( uids, )
                    failed = []
                    for uid in uids:
                        instance = self.actorInfo.get( uid, {} ).get( 'instance', None )
                        if instance is None:
                            failed.append( errorMessage( 'actor not found' ) )
                        else:
                            newMsg = instance[ 'socket' ].request( { 'req' : 'kill_actor',
                                                                     'uid' : uid }, timeout = 20 )
                            if not isMessageSuccess( newMsg ):
                                self._log( "failed to kill actor %s: %s" % ( uid, str( newMsg ) ) )
                                failed.append( newMsg )
                            # Always clean up the directory entry, even if the
                            # instance reported a kill failure.
                            if not self._removeUidFromDirectory( uid ):
                                failed.append( errorMessage( 'error removing actor from directory after stop' ) )
                            self._addTombstone( uid )
                            self._removeInstanceIfIsolated( instance )
                            self.isActorChanged.set()
                    if 0 != len( failed ):
                        z.send( errorMessage( 'some actors failed to stop', failed ) )
                    else:
                        z.send( successMessage() )
            elif 'remove_actor' == action:
                # Remove an actor from the directory without asking the
                # instance to kill it.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                elif 'uid' not in data:
                    z.send( errorMessage( 'missing information to remove actor' ) )
                else:
                    uid = data[ 'uid' ]
                    instance = self.actorInfo.get( uid, {} ).get( 'instance', None )
                    if instance is not None and self._removeUidFromDirectory( uid ):
                        z.send( successMessage() )
                        self.isActorChanged.set()
                        self._removeInstanceIfIsolated( instance )
                    else:
                        z.send( errorMessage( 'actor to stop not found' ) )
            elif 'host_info' == action:
                # CPU/memory snapshot, cached for 10 seconds since
                # cpu_percent() blocks for its 2-second sampling interval.
                if self.lastHostInfo is None or time.time() >= self.lastHostInfoCheck + 10:
                    self.lastHostInfoCheck = time.time()
                    self.lastHostInfo = { 'info' : { 'cpu' : psutil.cpu_percent( percpu = True, interval = 2 ),
                                                     'mem' : psutil.virtual_memory().percent } }
                z.send( successMessage( self.lastHostInfo ) )
            elif 'get_full_dir' == action:
                with self.dirLock.reader():
                    #z.send( successMessage( { 'realms' : { k : dict( v ) for k, v in self.directory.iteritems() }, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
                    z.send( successMessage( { 'realms' : self.nonOptDir, 'reverse' : self.reverseDir, 'is_inited' : self.isInitialSyncDone } ), isSkipSanitization = True )
            elif 'get_dir' == action:
                realm = data.get( 'realm', 'global' )
                if 'cat' in data:
                    z.send( successMessage( data = { 'endpoints' : self._getDirectoryEntriesFor( realm, data[ 'cat' ] ) } ) )
                else:
                    z.send( errorMessage( 'no category specified' ) )
            elif 'get_cats_under' == action:
                # List categories strictly below the requested prefix.
                realm = data.get( 'realm', 'global' )
                if 'cat' in data:
                    with self.dirLock.reader():
                        z.send( successMessage( data = { 'categories' : [ x for x in self.directory.get( realm, PrefixDict() ).startswith( data[ 'cat' ] ) if x != data[ 'cat' ] ] } ) )
                else:
                    z.send( errorMessage( 'no category specified' ) )
            elif 'get_nodes' == action:
                nodeList = {}
                for k in self.nodes.keys():
                    nodeList[ k ] = { 'last_seen' : self.nodes[ k ][ 'last_seen' ] }
                z.send( successMessage( { 'nodes' : nodeList } ) )
            elif 'flush' == action:
                # Stop every actor on this host: unregister all first, then
                # kill them all in parallel via their instances.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                else:
                    resp = successMessage()
                    actors = self.actorInfo.items()
                    for uid, actor in actors:
                        self._removeUidFromDirectory( uid )
                    results = parallelExec( lambda x: x[ 1 ][ 'instance' ][ 'socket' ].request( { 'req' : 'kill_actor',
                                                                                                 'uid' : x[ 0 ] }, timeout = 30 ), actors )
                    if all( isMessageSuccess( x ) for x in results ):
                        self._log( "all actors stopped" )
                    else:
                        resp = errorMessage( 'error stopping actor' )
                    for uid, actor in actors:
                        self._removeInstanceIfIsolated( actor[ 'instance' ] )
                    z.send( resp )
                    if isMessageSuccess( resp ):
                        self.isActorChanged.set()
            elif 'get_dir_sync' == action:
                with self.dirLock.reader():
                    #z.send( successMessage( { 'directory' : { k : dict( v ) for k, v in self.directory.iteritems() }, 'tombstones' : self.tombstones, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
                    z.send( successMessage( { 'directory' : self.nonOptDir, 'tombstones' : self.tombstones, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
            elif 'push_dir_sync' == action:
                # Peer pushed us its directory; ack immediately, then merge
                # tombstones and directory contents.
                if 'directory' in data and 'tombstones' in data and 'reverse' in data:
                    z.send( successMessage() )
                    for uid, ts in data[ 'tombstones' ].iteritems():
                        self._addTombstone( uid, ts )
                    self._updateDirectoryWith( self.directory, self.nonOptDir, data[ 'directory' ], data[ 'reverse' ] )
                else:
                    z.send( errorMessage( 'missing information to update directory' ) )
            elif 'get_full_mtd' == action:
                z.send( successMessage( { 'mtd' : self.actorInfo } ) )
            elif 'get_load_info' == action:
                # Aggregate load info from every instance on this host.
                info = {}
                for instance in self.processes:
                    tmp = instance[ 'socket' ].request( { 'req' : 'get_load_info' }, timeout = 5 )
                    if isMessageSuccess( tmp ):
                        info.update( tmp[ 'data' ] )
                z.send( successMessage( { 'load' : info } ) )
            elif 'associate' == action:
                # Add an existing locally-hosted actor to an extra category.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                else:
                    uid = data[ 'uid' ]
                    category = data[ 'category' ]
                    try:
                        info = self.actorInfo[ uid ]
                        with self.dirLock.writer():
                            self.directory.setdefault( info[ 'realm' ], PrefixDict() ).setdefault( category, {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, info[ 'port' ] )
                            self.nonOptDir.setdefault( info[ 'realm' ], {} ).setdefault( category, {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, info[ 'port' ] )
                    except:
                        z.send( errorMessage( 'error associating, actor hosted here?' ) )
                    else:
                        self.isActorChanged.set()
                        z.send( successMessage() )
            elif 'disassociate' == action:
                # Remove an actor from one category, dropping the category
                # entirely if it becomes empty.
                if not self._isPrivileged( data ):
                    z.send( errorMessage( 'unprivileged' ) )
                else:
                    uid = data[ 'uid' ]
                    category = data[ 'category' ]
                    try:
                        info = self.actorInfo[ uid ]
                        with self.dirLock.writer():
                            self.directory[ info[ 'realm' ] ][ category ].pop( uid )
                            self.nonOptDir[ info[ 'realm' ] ][ category ].pop( uid )
                            if 0 == len( self.directory[ info[ 'realm' ] ][ category ] ):
                                del( self.directory[ info[ 'realm' ] ][ category ] )
                                del( self.nonOptDir[ info[ 'realm' ] ][ category ] )
                    # NOTE(review): this error text says "associating" although
                    # the branch is disassociate — looks like a copy/paste slip,
                    # confirm before changing since callers may match on it.
                    except:
                        z.send( errorMessage( 'error associating, actor exists in category?' ) )
                    else:
                        self.isActorChanged.set()
                        z.send( successMessage() )
            else:
                z.send( errorMessage( 'unknown request', data = { 'req' : action } ) )
            #self._log( "Action %s done after %s seconds." % ( action, time.time() - start ) )
        else:
            z.send( errorMessage( 'invalid request' ) )
            self._logCritical( "Received completely invalid request" )
def __init__( self, configFile, logging_level, logging_dest, iface = None ):
    """Construct and run the host manager.

    Loads the YAML config, binds the directory and ops sockets, connects
    to seed nodes, spawns all background services and the local actor
    host instances, then blocks until the stop event fires and tears the
    instances down. Note: this constructor does not return until shutdown.

    configFile    -- path to the YAML configuration file.
    logging_level -- level passed to _initLogging.
    logging_dest  -- destination passed to _initLogging.
    iface         -- optional interface name overriding the config file.
    """
    # Setting the signal handler to trigger the stop event
    global timeToStopEvent
    gevent.signal( signal.SIGQUIT, _stop )
    gevent.signal( signal.SIGINT, _stop )
    gevent.signal( signal.SIGTERM, _stop )

    self._logger = None
    self._log_level = logging_level
    self._log_dest = logging_dest
    self._initLogging( logging_level, logging_dest )

    self.stopEvent = timeToStopEvent
    self.py_beach_dir = None
    self.configFilePath = os.path.abspath( configFile )
    self.configFile = None
    self.directory = {}
    self.isInitialSyncDone = False
    # This is an unoptimized version of self.directory we maintain because converting
    # the optimized version to a straight dict is very expensive.
    self.nonOptDir = {}
    self.reverseDir = {}
    self.tombstones = {}
    self.actorInfo = {}
    self.ports_available = Set()
    self.nProcesses = 0
    self.processes = []
    self.seedNodes = []
    self.directoryPort = None
    self.opsPort = 0
    self.opsSocket = None
    self.port_range = ( 0, 0 )
    self.interface = None
    self.ifaceIp4 = None
    self.nodes = {}
    self.peer_keepalive_seconds = 0
    self.instance_keepalive_seconds = 0
    self.tombstone_culling_seconds = 0
    self.isActorChanged = gevent.event.Event()
    self.isTombstoneChanged = gevent.event.Event()
    # NOTE(review): _svc_instance_draining sets self.isInstanceChanged,
    # which is not initialized here — confirm it is defined elsewhere.
    self.dirLock = RWLock()
    self.lastHostInfo = None
    self.lastHostInfoCheck = 0

    # Cleanup potentially old sockets
    os.system( 'rm -f /tmp/py_beach*' )

    # Load default configs
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects — confirm the config file is trusted.
    with open( self.configFilePath, 'r' ) as f:
        self.configFile = yaml.load( f )

    self.py_beach_dir = os.path.dirname( os.path.abspath( __file__ ) )
    # Relative paths in the config are resolved from the config's directory.
    os.chdir( os.path.dirname( os.path.abspath( self.configFilePath ) ) )

    # 'private_key' in the config is a file path; replace it with the
    # key's contents once read.
    self.private_key = self.configFile.get( 'private_key', None )
    if self.private_key is not None:
        with open( self.private_key, 'r' ) as f:
            key_path = self.private_key
            self.private_key = f.read()
            self._log( "Using shared key: %s" % key_path )

    self.admin_token = self.configFile.get( 'admin_token', None )

    # n_processes of 0 means one instance per CPU core.
    self.nProcesses = self.configFile.get( 'n_processes', 0 )
    if self.nProcesses == 0:
        self.nProcesses = multiprocessing.cpu_count()
    self._log( "Using %d instances per node" % self.nProcesses )

    # Interface resolution precedence: cli argument, then config file,
    # then auto-detection over the public interfaces.
    if iface is not None:
        self.interface = iface
        self.ifaceIp4 = _getIpv4ForIface( self.interface )
        if self.ifaceIp4 is None:
            self._logCritical( "Could not use iface %s (from cli)." % self.interface )
            sys.exit( -1 )
    else:
        self.interface = self.configFile.get( 'interface', None )
        if self.interface is not None:
            self.ifaceIp4 = _getIpv4ForIface( self.interface )
            if self.ifaceIp4 is None:
                self._logCritical( "Could not use iface %s (from config)." % self.interface )
                sys.exit( -1 )

    # Building a list of interfaces to auto-detect
    defaultInterfaces = _getPublicInterfaces()
    while self.ifaceIp4 is None and 0 != len( defaultInterfaces ):
        self.interface = defaultInterfaces.pop()
        self.ifaceIp4 = _getIpv4ForIface( self.interface )
        if self.ifaceIp4 is None:
            self._log( "Failed to use interface %s." % self.interface )

    if self.ifaceIp4 is None:
        self._logCritical( "Could not find an interface to use." )
        sys.exit( -1 )

    # With no configured seeds, this node seeds itself.
    self.seedNodes = self.configFile.get( 'seed_nodes', [] )
    if 0 == len( self.seedNodes ):
        self.seedNodes.append( self.ifaceIp4 )

    for s in self.seedNodes:
        self._log( "Using seed node: %s" % s )

    self.directoryPort = _ZMREP( self.configFile.get( 'directory_port',
                                                      'ipc:///tmp/py_beach_directory_port' ),
                                 isBind = True,
                                 private_key = self.private_key )

    self.opsPort = self.configFile.get( 'ops_port', 4999 )
    self.opsSocket = _ZMREP( 'tcp://%s:%d' % ( self.ifaceIp4, self.opsPort ),
                             isBind = True,
                             private_key = self.private_key )
    self._log( "Listening for ops on %s:%d" % ( self.ifaceIp4, self.opsPort ) )

    # Pool of TCP ports handed out to actors (range is inclusive).
    self.port_range = ( self.configFile.get( 'port_range_start', 5000 ),
                        self.configFile.get( 'port_range_end', 6000 ) )
    self.ports_available.update( xrange( self.port_range[ 0 ], self.port_range[ 1 ] + 1 ) )

    self.peer_keepalive_seconds = self.configFile.get( 'peer_keepalive_seconds', 60 )
    self.instance_keepalive_seconds = self.configFile.get( 'instance_keepalive_seconds', 600 )
    self.directory_sync_seconds = self.configFile.get( 'directory_sync_seconds', 600 )
    self.tombstone_culling_seconds = self.configFile.get( 'tombstone_culling_seconds', 3600 )
    self.instance_strategy = self.configFile.get( 'instance_strategy', 'random' )
    self.highMemWatermark = self.configFile.get( 'high_mem_watermark', 80 )

    # Bootstrap the seeds
    for s in self.seedNodes:
        self._connectToNode( s )

    # Start services
    # Random start delays spread the periodic services' wakeups apart.
    self._log( "Starting services" )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_directory_requests )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_instance_keepalive )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_host_keepalive )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_directory_sync )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_cullTombstones )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_applyTombstones )
    gevent.spawn_later( random.randint( 0, 3 ), self._svc_cleanupCats )
    gevent.spawn_later( random.randint( 0, 60 * 5 ), self._svc_instance_draining )
    # Multiple concurrent ops handlers share the same ops socket.
    for _ in range( 20 ):
        gevent.spawn( self._svc_receiveOpsTasks )
    gevent.spawn( self._svc_pushDirChanges )

    # Start the instances
    for n in range( self.nProcesses ):
        self._startInstance( isIsolated = False )

    # Wait to be signaled to exit
    self._log( "Up and running" )
    timeToStopEvent.wait()

    # Any teardown required
    parallelExec( self._teardownInstance, self.processes[:] )

    self._log( "Exiting." )
def __init__(self, configFile, instanceId, logging_level, logging_dest, interface):
    """Construct and run an actor host instance.

    Loads the YAML config, binds the per-instance ops socket, connects
    back to the host manager's ops port, spawns the instance services,
    then blocks until the stop event fires and shuts the actors and
    sockets down. Note: this constructor does not return until shutdown.

    configFile    -- path to the YAML configuration file.
    instanceId    -- identifier used in the instance's ipc socket name.
    logging_level -- level passed to _initLogging.
    logging_dest  -- destination passed to _initLogging.
    interface     -- network interface to resolve the local IPv4 from.
    """
    # Setting the signal handler to trigger the stop event which
    # is interpreted by each actor implementation
    global timeToStopEvent
    gevent.signal(signal.SIGQUIT, _stopAllActors)
    gevent.signal(signal.SIGINT, _stopAllActors)
    gevent.signal(signal.SIGTERM, _stopAllActors)

    self.instanceId = instanceId
    self._log_level = logging_level
    self._log_dest = logging_dest
    self._initLogging(logging_level, logging_dest)
    self.log("Initializing")
    self.stopEvent = timeToStopEvent
    # isOpen is cleared when the instance is draining (see drain()).
    self.isOpen = True
    self.actors = {}
    self.py_beach_dir = None
    self.configFilePath = configFile
    self.configFile = None
    self.interface = interface
    self.ifaceIp4 = _getIpv4ForIface(self.interface)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects — confirm the config file is trusted.
    with open(self.configFilePath, 'r') as f:
        self.configFile = yaml.load(f)

    self.py_beach_dir = os.path.dirname(os.path.abspath(__file__))
    # Relative paths in the config are resolved from the config's directory.
    os.chdir(os.path.dirname(os.path.abspath(self.configFilePath)))

    # 'private_key' in the config is a file path; replace it with the
    # key's contents once read.
    self.private_key = self.configFile.get('private_key', None)
    if self.private_key is not None:
        with open(self.private_key, 'r') as f:
            key_path = self.private_key
            self.private_key = f.read()
            self.log("Using shared key: %s" % key_path)

    # code_directory may be a URL (contains '://'); only local paths
    # are made absolute.
    self.codeDirectory = self.configFile.get('code_directory', './')
    if '://' not in self.codeDirectory:
        self.codeDirectory = os.path.abspath(self.codeDirectory)
    Actor._code_directory_root = self.codeDirectory

    # Per-instance ops socket the host manager talks to us on.
    self.opsSocket = _ZMREP('ipc:///tmp/py_beach_instance_%s' % instanceId, isBind=True)
    #self.log( "Listening for ops on %s" % ( 'ipc:///tmp/py_beach_instance_%s' % instanceId, ) )

    # Outbound connection back to the host manager's ops port.
    self.hostOpsPort = self.configFile.get('ops_port', 4999)
    self.hostOpsSocket = _ZMREP('tcp://%s:%d' % (self.ifaceIp4, self.hostOpsPort),
                                isBind=False,
                                private_key=self.private_key)

    ActorHandle._setHostDirInfo(
        self.configFile.get('directory_port', 'ipc:///tmp/py_beach_directory_port'),
        self.private_key)
    ActorHandleGroup._setHostDirInfo(
        'tcp://%s:%d' % (self.ifaceIp4, self.hostOpsPort),
        self.private_key)

    # Multiple concurrent task handlers share the same ops socket.
    for _ in range(20):
        gevent.spawn(self.svc_receiveTasks)
    gevent.spawn(self.svc_monitorActors)
    gevent.spawn(self.svc_reportUsage)
    #self.log( "Now open to actors" )

    # Block until signaled to shut down.
    timeToStopEvent.wait()

    self.log("Exiting, stopping all actors.")
    parallelExec(lambda x: x.stop(), self.actors.values())
    # Actors are joinable here — presumably greenlets; TODO confirm.
    gevent.joinall(self.actors.values())
    self.hostOpsSocket.close()
    self.opsSocket.close()
    self.log("All Actors exited, exiting.")