def onWorkerAdd(self, ip, ports, machineIdAsString):
    """React to a newly announced worker by initiating a channel to it.

    To avoid both sides dialing simultaneously, only the machine with the
    larger MachineId initiates; the smaller one waits for the incoming
    connection. The actual connect happens on a background thread via
    onWorkerAdd2, tagged with a fresh random guid.
    """
    remoteMachineId = CumulusNative.MachineId(Hash.Hash.stringToHash(machineIdAsString))

    #only connect one way. If the worker is larger than us, then we connect to it
    if remoteMachineId <= self.machineId:
        logging.info(
            "Worker %s detected worker %s, and waiting for incoming connection",
            self.machineId,
            remoteMachineId
            )
        return

    connectionGuid = Hash.Hash.sha1(str(uuid.uuid4()))

    logging.info(
        "Worker %s detected worker %s and initiating connection with guid %s",
        self.machineId,
        remoteMachineId,
        connectionGuid
        )

    # Track that we are trying to connect to this machine
    with self.lock:
        self.connectingMachines.add(remoteMachineId)

    connectThread = ManagedThread.ManagedThread(
        target=self.onWorkerAdd2,
        args=(remoteMachineId, ip, ports, connectionGuid)
        )
    connectThread.start()
def logBadUforaVersionOnChannel(self, version):
    """Log a diagnostic for a handshake 'version' message that failed validation.

    A known failure mode is a peer that skips the version message and sends
    its MachineId first; we try to deserialize the payload as a MachineId so
    the log makes the mismatch obvious. If that fails too, we log the raw
    payload.
    """
    try:
        anId = CumulusNative.MachineId(Hash.Hash(0))
        anId.__setstate__(version)
        logging.error(
            "CumulusService %s received a bad version message that is, " \
            "in fact, a machineId: %s",
            self.machineId,
            anId
            )
    except Exception:
        # FIX: was a bare 'except:', which would also intercept
        # KeyboardInterrupt / SystemExit. Any deserialization failure just
        # means the payload is not a MachineId.
        logging.error(
            "CumulusService %s received a bad version message that is not a machineId: %s",
            self.machineId,
            repr(version))
def onWorkerDrop(self, machineIdAsString):
    """Forget a worker that has left the cluster.

    Records the machine as disconnected, removes it from the desired and
    connected sets, notifies when the connected-machine count reaches zero,
    and finally tells the native client to drop it. All bookkeeping happens
    under the client lock.
    """
    with self.lock_:
        droppedId = CumulusNative.MachineId(
            HashNative.Hash.stringToHash(machineIdAsString))

        self.disconnectedMachines_.add(droppedId)

        # discard() is a no-op when the id is absent, so no membership
        # check is needed.
        self.desiredMachines_.discard(droppedId)

        if droppedId not in self.connectedMachines_:
            return

        self.connectedMachines_.discard(droppedId)
        if not self.connectedMachines_:
            self.onMachineCountWentToZero()

        self.cumulusClient.dropMachine(droppedId)
def onWorkerAdd(self, ip, ports, machineIdAsString):
    """Begin connecting this client to a newly announced worker.

    Marks the worker as desired and spawns a background thread to perform
    the actual connection. Dead connector threads are pruned from
    connectingThreads_ as a side effect.
    """
    machineId = CumulusNative.MachineId(
        HashNative.Hash.stringToHash(machineIdAsString))

    with self.lock_:
        if self.isTornDown_:
            return

        logging.info("CumulusClient %s preparing to connect to %s",
                     self.cumulusClientId, machineId)

        self.desiredMachines_.add(machineId)

        newThread = ManagedThread.ManagedThread(
            target=self.addDesiredMachine, args=(machineId, ip, ports))

        # BUG FIX: prune dead threads BEFORE appending the new one. The new
        # thread has not been started yet, so isAlive() is False and the
        # original order (append, then filter) dropped it from the list
        # immediately - connectingThreads_ never actually tracked it.
        self.connectingThreads_ = [
            x for x in self.connectingThreads_ if x.isAlive()
            ]
        self.connectingThreads_.append(newThread)

        newThread.start()
def isOwnHashInHandshakeMessage(self, message):
    """Return True if *message* deserializes to our own MachineId.

    Used during the channel handshake to verify the connecting peer really
    intended to reach this machine. Returns False (and logs an error) on a
    missing message, an undeserializable payload, or an Id mismatch.
    """
    if message is None:
        logging.error(
            "CumulusService %s didn't receive own Id in handshake.",
            self.machineId)
        return False

    try:
        machineId = CumulusNative.MachineId(Hash.Hash(0))
        machineId.__setstate__(message)
    except Exception:
        # FIX: was a bare 'except:', which would also swallow
        # KeyboardInterrupt / SystemExit. The string sentinel makes the
        # mismatch branch below fire and keeps the log informative.
        machineId = "not a valid machine ID"

    if isinstance(machineId, str) or machineId != self.machineId:
        logging.error(
            "CumulusWorker %s received connection intended for another machine (%s). %s != %s",
            self.machineId,
            machineId,
            repr(message),
            repr(self.machineId.__getstate__()))
        return False

    return True
def onWorkerDrop(self, machineIdAsString):
    """Forget a worker that has left the cluster (worker-side bookkeeping).

    Ignores a drop notice for ourselves. Otherwise removes the machine from
    the connecting/connected sets, remembers it in droppedMachineIds, and
    tells the native worker to drop it if we were actually connected.
    """
    machineId = CumulusNative.MachineId(Hash.Hash.stringToHash(machineIdAsString))

    if machineId == self.machineId:
        # we can receive our own drop notification; nothing to do.
        return

    logging.info("CumulusService %s dropped worker %s", self.machineId, machineId)

    try:
        hadMachine = False

        with self.lock:
            if machineId in self.connectedMachines:
                hadMachine = True
            self.connectingMachines.discard(machineId)
            self.connectedMachines.discard(machineId)
            self.droppedMachineIds.add(machineId)

        # call into the native worker outside the membership checks;
        # only if we actually held a connection to this machine.
        if hadMachine:
            self.cumulusWorker.dropMachine(machineId)
    except Exception:
        # FIX: was a bare 'except:'. We still log and re-raise, but no
        # longer intercept KeyboardInterrupt / SystemExit.
        logging.error("Failed to drop worker: %s", traceback.format_exc())
        raise
def machineId(ix, seed = None):
    """Build a deterministic MachineId from an integer index.

    Without a seed, the id is just Hash(ix); with a seed string, the sha1 of
    the seed is folded in so distinct seeds yield distinct machine ids for
    the same index.
    """
    baseHash = HashNative.Hash(ix)
    if seed is None:
        return CumulusNative.MachineId(baseHash)
    return CumulusNative.MachineId(baseHash + HashNative.Hash.sha1(seed))
def machineId(ix):
    """Return the MachineId whose underlying hash is the integer *ix*."""
    indexHash = HashNative.Hash(ix)
    return CumulusNative.MachineId(indexHash)
def __init__(self, ownAddress, channelListener, channelFactory, eventHandler, callbackScheduler, diagnosticsDir, config, viewFactory, s3InterfaceFactory=None, objectStore=None):
    """Construct the worker service: identity, caches, VDM, and native worker.

    Side effects: initializes the global Runtime and ModuleImporter, may
    delete the on-disk cache directory, creates the native CumulusWorker and
    immediately starts its computations (and the dataset-load service when an
    s3InterfaceFactory is supplied).
    """
    Stoppable.Stoppable.__init__(self)

    #acquire a machineId randomly, using uuid
    self.machineId = CumulusNative.MachineId(
        Hash.Hash.sha1(str(uuid.uuid4()))
        )

    self.ownAddress = ownAddress
    self.callbackScheduler = callbackScheduler
    self.viewFactory = viewFactory
    self.s3InterfaceFactory = s3InterfaceFactory
    self.objectStore = objectStore
    self.threadsStarted_ = False

    # cluster-membership bookkeeping, guarded by self.lock
    self.connectedMachines = set()
    self.connectingMachines = set()  # machines we are in the process of connecting to
    self.droppedMachineIds = set()
    self.lock = threading.RLock()

    # cache sizes are configured in MB; convert to bytes here
    self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024*1024
    self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024*1024
    self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
    self.cumulusTrackTcmalloc = config.cumulusTrackTcmalloc

    self.eventHandler = eventHandler
    self.reconnectPersistentCacheIndexViewThreads = []

    # A configured subdirectory marks the disk cache as private to this
    # service, so it is safe to delete on teardown; without one we share
    # the directory and must not delete it.
    if config.cumulusDiskCacheStorageSubdirectory is not None:
        self.cumulusDiskCacheWantsDeletionOnTeardown = True
        self.cumulusDiskCacheStorageDir = os.path.join(
            config.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageSubdirectory
            )
    else:
        self.cumulusDiskCacheWantsDeletionOnTeardown = False
        self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

    self._stopEvent = threading.Event()

    self._channelListener = channelListener
    # the service expects exactly two listening ports (presumably data and
    # control channels - confirm against the listener's construction site)
    assert len(self._channelListener.ports) == 2
    self._channelFactory = channelFactory

    # global (process-wide) initialization
    Runtime.initialize()
    ModuleImporter.initialize()

    self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
        self.viewFactory
        )

    # thread is created here but started later (threadsStarted_ is False)
    self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
        target=self._channelListener.start
        )

    self.vdm = VectorDataManager.constructVDM(
        callbackScheduler,
        self.cumulusVectorRamCacheSizeOverride,
        self.cumulusMaxRamCacheSizeOverride
        )

    if self.cumulusTrackTcmalloc:
        self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

    self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
        viewFactory.createView(retrySeconds=10.0, numRetries=10),
        callbackScheduler
        )

    self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

    # wipe the previous on-disk cache if it is private to this service
    self.deleteCumulusDiskCacheIfNecessary()

    self.offlineCache = CumulusNative.DiskOfflineCache(
        callbackScheduler,
        self.cumulusDiskCacheStorageDir,
        config.cumulusDiskCacheStorageMB * 1024 * 1024,
        config.cumulusDiskCacheStorageFileCount
        )

    #If the "s3InterfaceFactory" is not in-memory, we use real out of process python.
    #it would be better if this were more explicit
    outOfProcess = self.s3InterfaceFactory is not None and self.s3InterfaceFactory.isCompatibleWithOutOfProcessDownloadPool

    self.outOfProcessPythonTasks = OutOfProcessPythonTasks.OutOfProcessPythonTasks(outOfProcess=outOfProcess)

    self.vdm.initializeOutOfProcessPythonTasks(self.outOfProcessPythonTasks.nativeTasks)

    checkpointInterval = config.cumulusCheckpointIntervalSeconds

    # interval of 0 disables checkpointing entirely
    if checkpointInterval == 0:
        checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None()
    else:
        checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
            checkpointInterval,
            1024 * 1024
            )

    self.cumulusWorker = self.constructCumlusWorker(
        callbackScheduler,
        CumulusNative.CumulusWorkerConfiguration(
            self.machineId,
            self.cumulusThreadCountOverride,
            checkpointPolicy,
            ExecutionContext.createContextConfiguration(),
            diagnosticsDir or ""
            ),
        self.vdm,
        self.offlineCache,
        eventHandler
        )

    self.datasetLoadService = None
    # external dataset loading only exists when an s3 interface is supplied
    if self.s3InterfaceFactory:
        externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
            callbackScheduler
            )
        self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
            self.s3InterfaceFactory,
            self.objectStore,
            self.vdm,
            externalDatasetChannel.makeQueuelike(callbackScheduler)
            )

    self.cumulusWorker.startComputations()

    if self.datasetLoadService:
        self.datasetLoadService.startService()
def doChannelHandshake(self, channel):
    """Run the server side of the channel handshake on an incoming connection.

    Protocol (each read bounded by HANDSHAKE_TIMEOUT):
      1. ufora version string - must match ours exactly
      2. our own MachineId - proves the peer meant to reach this machine
      3. the peer's CumulusClientOrMachine identity
      4. a connection guid
    On success we reply with the builtin module hash and register the channel
    under (identity, guid). Any failure or timeout disconnects the channel.
    """
    try:
        logging.debug("Worker %s beginning channel handshake", self.machineId)

        version = channel.getTimeout(HANDSHAKE_TIMEOUT)
        if version is None:
            logging.error(
                "CAN'T ACCEPT CONNECTION!\n"
                "CumulusService %s couldn't read client version within the configured timeout",
                self.machineId
                )
            # BUG FIX: previously fell through here, so a timeout (None) was
            # also fed to the bad-version path and double-logged. Disconnect
            # and bail out immediately instead.
            channel.disconnect()
            return

        if version != ufora.version:
            self.logBadUforaVersionOnChannel(version)
            channel.disconnect()
            return

        logging.debug(
            "CumulusService %s accepted connection from client with version %s",
            self.machineId,
            version
            )

        msgThatShouldBeMyOwnHash = channel.getTimeout(HANDSHAKE_TIMEOUT)

        if not self.isOwnHashInHandshakeMessage(msgThatShouldBeMyOwnHash):
            channel.disconnect()
            return

        msg = channel.getTimeout(HANDSHAKE_TIMEOUT)
        if msg is None:
            logging.error(
                "CAN'T ACCEPT CONNECTION!\n"
                "Worker %s didn't received remote machine ID during handshake",
                self.machineId
                )
            channel.disconnect()
            return

        # deserialize the peer's identity into a placeholder Machine value
        clientOrMachine = CumulusNative.CumulusClientOrMachine.Machine(
            CumulusNative.MachineId(
                Hash.Hash(0)
                )
            )
        clientOrMachine.__setstate__(msg)

        hashGuid = Hash.Hash(0)

        msg = channel.getTimeout(HANDSHAKE_TIMEOUT)
        if msg is None:
            logging.error(
                "CAN'T ACCEPT CONNECTION!\n"
                "Worker %s didn't received handshake GUID",
                self.machineId
                )
            channel.disconnect()
            return

        hashGuid.__setstate__(msg)

        logging.debug(
            "Worker %s accepted connection with guid %s from %s",
            self.machineId,
            hashGuid,
            clientOrMachine
            )

        # confirm the handshake by echoing our builtin module hash
        channel.write(
            ModuleImporter.builtinModuleImplVal().hash.__getstate__()
            )

        with self.lock:
            self._channelListener.setGroupIdForAcceptedChannel(
                channel,
                (clientOrMachine, hashGuid)
                )

        logging.debug("CumulusService %s added a channel to group %s",
                      self.machineId,
                      (clientOrMachine, hashGuid))
    except:
        # deliberately broad: any failure during handshake must not kill the
        # accept loop; log and drop the connection.
        logging.error("FAILED TO PROCESS INCOMING CONNECTION: %s", traceback.format_exc())
        channel.disconnect()
def __init__(self, ownAddress, channelListener, channelFactory, eventHandler,
             callbackScheduler, diagnosticsDir, config, viewFactory):
    """Construct the service: identity, caches, VDM, and the native worker.

    Side effects: initializes the global Runtime and ModuleImporter, may
    delete the on-disk cache directory, creates the native CumulusWorker
    and immediately starts its computations.
    """
    Stoppable.Stoppable.__init__(self)

    #acquire a machineId randomly, using uuid
    self.machineId = CumulusNative.MachineId(
        Hash.Hash.sha1(str(uuid.uuid4())))

    self.ownAddress = ownAddress
    self.callbackScheduler = callbackScheduler
    self.viewFactory = viewFactory
    self.threadsStarted_ = False

    # cluster-membership bookkeeping, guarded by self.lock
    self.connectedMachines = set()
    self.connectingMachines = set(
    )  # machines we are in the process of connecting to
    self.droppedMachineIds = set()
    self.lock = threading.RLock()

    # cache sizes are configured in MB; convert to bytes here
    self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024 * 1024
    self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024 * 1024
    self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
    self.cumulusTrackTcMalloc = config.cumulusTrackTcmalloc

    self.reconnectPersistentCacheIndexViewThreads = []

    # A configured subdirectory marks the disk cache as private to this
    # service, so it is safe to delete on teardown; without one we share
    # the directory and must not delete it.
    if config.cumulusDiskCacheStorageSubdirectory is not None:
        self.cumulusDiskCacheWantsDeletionOnTeardown = True
        self.cumulusDiskCacheStorageDir = os.path.join(
            config.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageSubdirectory)
    else:
        self.cumulusDiskCacheWantsDeletionOnTeardown = False
        self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

    logging.info(
        "Creating a CumulusService with ram cache of %s / %s MB and %s threads",
        self.cumulusVectorRamCacheSizeOverride / 1024.0 / 1024.0,
        self.cumulusMaxRamCacheSizeOverride / 1024.0 / 1024.0,
        self.cumulusThreadCountOverride)

    self._stopEvent = threading.Event()

    self._channelListener = channelListener
    # the service expects exactly two listening ports (presumably data and
    # control channels - confirm against the listener's construction site)
    assert len(self._channelListener.ports) == 2
    self._channelFactory = channelFactory

    # global (process-wide) initialization
    Runtime.initialize()
    ModuleImporter.initialize()

    self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
        self.viewFactory)

    # thread is created here but started later (threadsStarted_ is False)
    self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
        target=self._channelListener.start)

    self.vdm = VectorDataManager.constructVDM(
        callbackScheduler,
        self.cumulusVectorRamCacheSizeOverride,
        self.cumulusMaxRamCacheSizeOverride)

    if self.cumulusTrackTcMalloc:
        logging.info(
            "CumulusService enabling track-tc-malloc memory with a max cache of %s MB",
            self.cumulusMaxRamCacheSizeOverride / 1024 / 1024.0)
        self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

    self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
        viewFactory.createView(retrySeconds=10.0, numRetries=10),
        callbackScheduler)

    self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

    # wipe the previous on-disk cache if it is private to this service
    self.deleteCumulusDiskCacheIfNecessary()

    self.offlineCache = CumulusNative.DiskOfflineCache(
        callbackScheduler,
        self.cumulusDiskCacheStorageDir,
        config.cumulusDiskCacheStorageMB * 1024 * 1024,
        config.cumulusDiskCacheStorageFileCount)

    checkpointInterval = config.cumulusCheckpointIntervalSeconds

    # interval of 0 disables checkpointing entirely
    if checkpointInterval == 0:
        checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None ()
    else:
        checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
            checkpointInterval,
            1024 * 1024)

    self.cumulusWorker = self.constructCumlusWorker(
        callbackScheduler,
        CumulusNative.CumulusWorkerConfiguration(
            self.machineId,
            self.cumulusThreadCountOverride,
            checkpointPolicy,
            ExecutionContext.createContextConfiguration(),
            diagnosticsDir or ""),
        self.vdm,
        self.offlineCache,
        eventHandler)

    # NOTE(review): dataset-load service wiring is disabled in this variant;
    # the sibling constructor taking an s3InterfaceFactory still builds it.
    #externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
    #callbackScheduler
    #)
    #self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
    #settings.s3InterfaceFactory,
    #settings.objectStore,
    #self.vdm,
    #externalDatasetChannel.makeQueuelike(callbackScheduler)
    #)

    self.cumulusWorker.startComputations()