def _sendPing(self, first, last):
    """Send one heartbeat request and return the delay until the next one.

    The heartbeat items of the current workloads are written to an XML
    string and sent with a WorkerMessage heartbeat request. If the response
    status is not "OK" and its data lists faulty workloads, those workloads
    are killed.

    inputs:
        first   forwarded to the heartbeat request; tags the debug log
        last    forwarded to the heartbeat request; tags the debug log

    returns:
        int     seconds to wait before the next ping, as given by the
                response data
    """
    changed = self.cmdsChanged
    self.cmdsChanged = False

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost. Confirm (a) that joined workloads are only
    # reported while their parent is running and (b) that runCondVar is
    # meant to cover only the XML generation, not the network request.
    with self.runCondVar:
        # first write the items to xml
        cmds = self.worker._getWorkloads()
        co = StringIO()
        co.write('<heartbeat worker_id="%s">' % self.workerID)
        for item in cmds:
            if item.running:
                item.hbi.writeXML(co)
                for subwl in item.joinedTo:
                    subwl.hbi.writeXML(co)
        co.write("</heartbeat>")
        heartbeatXML = co.getvalue()

    clnt = WorkerMessage()
    resp = clnt.workerHeartbeatRequest(self.workerID,
                                       self.workerDir,
                                       first,
                                       last,
                                       changed,
                                       heartbeatXML)
    presp = ProcessedResponse(resp)
    status = presp.getStatus()
    respData = presp.getData()

    # Build the label used in the debug log only.
    timestr = " last" if last else ""
    if first:
        timestr += " first"
    if changed:
        timestr += " update"
    log.debug("Sent%s heartbeat signal. Result was %s" % (timestr, status))

    if status != "OK":
        # A non-OK response may carry the workloads the server considers
        # faulty. Only those workloads are killed; the worker itself keeps
        # running and schedules the next ping as usual.
        log.info("Error from heartbeat request. Stopping %s" % str(respData))
        if isinstance(respData, dict) and 'faulty' in respData:
            for faultyItem in respData['faulty']:
                self.worker.killWorkload(faultyItem)

    if isinstance(respData, dict):
        # Dict-style response: carries the interval and the random file.
        rettime = int(respData['heartbeat-time'])
        self.randomFile = respData['random-file']
        self._createRandomFile()
    else:
        # Plain response: the data is the interval itself.
        rettime = int(respData)
    log.debug("Waiting %s seconds for next ping" % (rettime))
    return rettime
def _sendPing(self, first, last):
    """Send one heartbeat request and return the delay until the next one.

    The heartbeat items of the current workloads are written to an XML
    string and sent with a WorkerMessage heartbeat request. If the response
    status is not "OK" and its data lists faulty workloads, those workloads
    are killed.

    inputs:
        first   forwarded to the heartbeat request; tags the debug log
        last    forwarded to the heartbeat request; tags the debug log

    returns:
        int     seconds to wait before the next ping, as given by the
                response data
    """
    changed = self.cmdsChanged
    self.cmdsChanged = False

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost. Confirm (a) that joined workloads are only
    # reported while their parent is running and (b) that runCondVar is
    # meant to cover only the XML generation, not the network request.
    with self.runCondVar:
        # first write the items to xml
        cmds = self.worker._getWorkloads()
        co = StringIO()
        co.write('<heartbeat worker_id="%s">' % self.workerID)
        for item in cmds:
            if item.running:
                item.hbi.writeXML(co)
                for subwl in item.joinedTo:
                    subwl.hbi.writeXML(co)
        co.write("</heartbeat>")
        heartbeatXML = co.getvalue()

    clnt = WorkerMessage()
    resp = clnt.workerHeartbeatRequest(self.workerID,
                                       self.workerDir,
                                       first,
                                       last,
                                       changed,
                                       heartbeatXML)
    presp = ProcessedResponse(resp)
    status = presp.getStatus()
    respData = presp.getData()

    # Build the label used in the debug log only.
    timestr = " last" if last else ""
    if first:
        timestr += " first"
    if changed:
        timestr += " update"
    log.debug("Sent%s heartbeat signal. Result was %s" % (timestr, status))

    if status != "OK":
        # A non-OK response may carry the workloads the server considers
        # faulty. Only those workloads are killed; the worker itself keeps
        # running and schedules the next ping as usual.
        log.info("Error from heartbeat request. Stopping %s" % str(respData))
        if isinstance(respData, dict) and 'faulty' in respData:
            for faultyItem in respData['faulty']:
                self.worker.killWorkload(faultyItem)

    if isinstance(respData, dict):
        # Dict-style response: carries the interval and the random file.
        rettime = int(respData['heartbeat-time'])
        self.randomFile = respData['random-file']
        self._createRandomFile()
    else:
        # Plain response: the data is the interval itself.
        rettime = int(respData)
    log.debug("Waiting %s seconds for next ping" % (rettime))
    return rettime
def run(self, serverState, request, response):
    """Send an add-node request to a remote server and record it.

    Reads the 'host' and 'client_secure_port' request parameters. If the
    host is already known to the configured nodes, an ERROR is added to the
    response. Otherwise an add-node request is sent to the host; on success
    a NodeConnectRequest is stored in the configuration and returned in the
    response data, on failure the remote message is reported as an ERROR.

    inputs:
        serverState   not used by this command
        request       provides the 'host' and 'client_secure_port' params
        response      receives the result or the error message
    """
    conf = ServerConf()
    host = request.getParam('host')
    client_secure_port = request.getParam('client_secure_port')

    # do we have a server with this hostname or fqdn?
    if conf.getNodes().hostnameOrFQDNExists(host):
        errorMessage = "%s is already trusted" % host
        response.add(errorMessage, status="ERROR")
        log.info(errorMessage)
        return

    serv = RawServerMessage(host, client_secure_port)
    resp = ProcessedResponse(serv.sendAddNodeRequest(host))
    if not resp.isOK():
        response.add("Remote server said: %s" % resp.getMessage(),
                     status="ERROR")
        return

    result = resp.getData()
    nodeConnectRequest = NodeConnectRequest(result['serverId'],
                                            int(client_secure_port),
                                            None,
                                            None,
                                            result['fqdn'],
                                            host)
    # Remember the outgoing request and hand it back to the caller.
    conf.addSentNodeConnectRequest(nodeConnectRequest)
    result['nodeConnectRequest'] = nodeConnectRequest
    log.info("Added node %s" % host)
    response.add('', result)
def run(self, serverState, request, response):
    """Send an add-node request to a remote server and record it.

    Reads the 'host' and 'client_secure_port' request parameters. If the
    host is already known to the configured nodes, an ERROR is added to the
    response. Otherwise an add-node request is sent to the host; on success
    a NodeConnectRequest is stored in the configuration and returned in the
    response data, on failure the remote message is reported as an ERROR.

    inputs:
        serverState   not used by this command
        request       provides the 'host' and 'client_secure_port' params
        response      receives the result or the error message
    """
    conf = ServerConf()
    host = request.getParam('host')
    client_secure_port = request.getParam('client_secure_port')

    # do we have a server with this hostname or fqdn?
    if conf.getNodes().hostnameOrFQDNExists(host):
        errorMessage = "%s is already trusted" % host
        response.add(errorMessage, status="ERROR")
        log.info(errorMessage)
        return

    serv = RawServerMessage(host, client_secure_port)
    resp = ProcessedResponse(serv.sendAddNodeRequest(host))
    if not resp.isOK():
        response.add("Remote server said: %s" % resp.getMessage(),
                     status="ERROR")
        return

    result = resp.getData()
    nodeConnectRequest = NodeConnectRequest(result['serverId'],
                                            int(client_secure_port),
                                            None,
                                            None,
                                            result['fqdn'],
                                            host)
    # Remember the outgoing request and hand it back to the caller.
    conf.addSentNodeConnectRequest(nodeConnectRequest)
    result['nodeConnectRequest'] = nodeConnectRequest
    log.info("Added node %s" % host)
    response.add('', result)
def requestNetworkTopology(topology, serverState=None):
    """
    Asks each neighbouring node for its network topology.

    inputs:
        topology:Nodes
            The topology generated so far. This node is added to it before
            any neighbour is asked.
        serverState:ServerState
            If provided, worker states are fetched.
            This method is called by getNetworkTopology(), which in turn is
            called from places where we do not pass (and do not want) the
            serverState, hence the option. It is also not needed there, as
            the calling server always knows the most up to date state of
            its own workers.

    returns:
        The topology after all neighbours have been visited. When a
        neighbour answers, the topology is replaced by the data of its
        response.
    """
    conf = ServerConf()
    thisNode = Node.getSelfNode(conf)
    thisNode.setNodes(conf.getNodes())
    topology.addNode(thisNode)
    if serverState:
        thisNode.workerStates = WorkerStateHandler.getConnectedWorkers(
            serverState.getWorkerStates())

    for node in thisNode.getNodes().nodes.itervalues():
        # Nodes already present in the topology are not asked again.
        if topology.exists(node.getId()):
            continue
        # connect to correct node
        if node.isConnected():
            try:
                clnt = DirectServerMessage(node, conf=conf)
                # send along the current topology
                rawresp = clnt.networkTopology(topology)
                processedResponse = ProcessedResponse(rawresp)
                topology = processedResponse.getData()
            except ServerConnectionError as e:
                # we cannot connect to the node, and it is marked as
                # unreachable; we must still add it to the topology
                log.error("node %s unreachable when asking for network "
                          "topology: error was %s" % (node.getId(), str(e)))
                topology.addNode(node)
        # todo notify in topology that this node is not connected?
    return topology
def run(self, serverState, request, response):
    """Handle a worker heartbeat request.

    The heartbeat items in the request are grouped by destination server.
    Items for this server are passed to the running command list; items
    for other servers are forwarded in a heartbeat request. Items reported
    as faulty are collected and returned to the worker in an ERROR
    response.

    inputs:
        serverState   gives access to the worker data list, the
                      configuration, the worker states and the running
                      command list
        request       provides 'worker_id', 'worker_dir', 'iteration',
                      'heartbeat_items' and optionally 'version'
        response      receives the heartbeat time; for version > 1 a dict
                      with 'heartbeat-time', 'random-file' and, on error,
                      'faulty'
    """
    workerID = request.getParam('worker_id')
    workerDir = request.getParam('worker_dir')
    iteration = request.getParam('iteration')
    itemsXML = request.getParam('heartbeat_items')
    version = 0
    if request.hasParam('version'):
        version = int(request.getParam('version'))

    hwr = cpc.command.heartbeat.HeartbeatItemReader()
    hwr.readString(itemsXML, "worker heartbeat items")
    heartbeatItems = hwr.getItems()

    # The worker data list
    workerDataList = serverState.getWorkerDataList()
    haveADir = False
    # Order the heartbeat items by destination server
    destList = {}
    Nhandled = 0
    for item in heartbeatItems:
        dest = item.getServerName()
        item.checkRunDir()
        if item.getHaveRunDir():
            haveADir = True
        destList.setdefault(dest, []).append(item)
        Nhandled += 1

    # NOTE(review): nesting reconstructed from a source whose indentation
    # was lost; confirm that the removal on "final" is meant to happen
    # regardless of haveADir.
    isFinal = (iteration == "final")
    if haveADir and not isFinal:
        workerDataList.add(workerDir)
    if isFinal:
        workerDataList.remove(workerDir)

    # get my own name to compare
    selfNode = Node.getSelfNode(serverState.conf)
    selfName = selfNode.getId()

    # Update the status at every heartbeat. This is how we know that the
    # worker is still talking to the server.
    serverState.setWorkerState(WorkerStatus.WORKER_STATUS_CONNECTED,
                               workerID,
                               request.headers['originating-client'])

    # Now iterate over the destinations, and send them their heartbeat
    # items.
    # Once we have many workers, this would be a place to pool heartbeat
    # items and send them as one big request.
    faultyItems = []
    for dest, items in destList.iteritems():
        if dest == selfName:
            # faultyItems is handed to ping() together with the items.
            serverState.getRunningCmdList().ping(workerID, workerDir,
                                                 iteration, items, True,
                                                 faultyItems)
        else:
            msg = ServerMessage(dest)
            co = StringIO()
            co.write('<heartbeat worker_id="%s" worker_server_id="%s">' %
                     (workerID, selfName))
            for item in items:
                item.writeXML(co)
            co.write('</heartbeat>')
            resp = msg.heartbeatForwardedRequest(workerID,
                                                 workerDir,
                                                 selfName,
                                                 iteration,
                                                 co.getvalue())
            presp = ProcessedResponse(resp)
            if presp.getStatus() != "OK":
                log.info("Heartbeat response from %s not OK" % dest)
                # The data of a failed response is added to the faulty
                # items.
                faultyItems.extend(presp.getData())

    if version > 1:
        retData = {'heartbeat-time': serverState.conf.getHeartbeatTime(),
                   'random-file': workerDataList.getRnd(workerDir)}
    else:
        retData = serverState.conf.getHeartbeatTime()

    if not faultyItems:
        response.add('', data=retData)
    else:
        if version > 1:
            # Only the dict-style reply can carry the faulty item list.
            retData['faulty'] = faultyItems
        # TODO: per-workload error reporting
        response.add('Heatbeat NOT OK', status="ERROR", data=retData)
    log.info("Handled %d heartbeat signal items." % (Nhandled))