def _register_service(self, port=None, installSignalHandlers=1): if self.__svcrgy: self.logs['main'].info( "Registering service with service registery %s... " % self.__svcrgy) svcrgy = hodXRClient(self.__svcrgy, None, None, 0, 0, installSignalHandlers) if self._xrc and self._http: svcrgy.registerService(self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', { 'xrs' : "http://%s:%s" % ( self._xrc.server_address[0], self._xrc.server_address[1]),'http' : "http://%s:%s" % (self._http.server_address[0], self._http.server_address[1])}) elif self._xrc: svcrgy.registerService(self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', { 'xrs' : "http://%s:%s" % ( self._xrc.server_address[0], self._xrc.server_address[1]),}) elif self._http: svcrgy.registerService(self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', {'http' : "http://%s:%s" % (self._http.server_address[0], self._http.server_address[1]),}) else: svcrgy.registerService(self._cfg['userid'], self._serviceID, self.hostname, name, 'hod', {} )
def testFailure(self):
    """Calling an RPC method that was never registered must raise Exception."""
    global serverPort
    serverUrl = 'http://localhost:' + str(serverPort)
    rpcClient = hodXRClient(serverUrl, retryRequests=False)
    # noMethod is looked up on the client but only invoked by assertRaises.
    self.assertRaises(Exception, rpcClient.noMethod)
def testTimeout(self):
    """An RPC call against a URL nobody listens on must raise Exception."""
    # Pick a random port in 40000-50000 so the URL points at no server.
    unusedPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                               high=50000)
    deadUrl = 'http://localhost:' + str(unusedPort)
    rpcClient = hodXRClient(deadUrl, retryRequests=False)
    self.assertRaises(Exception, rpcClient.testing)
def testTimeout(self):
    """An RPC call against a URL nobody listens on must raise Exception."""
    # Pick a random port in 40000-50000 so the URL points at no server.
    unusedPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                               high=50000)
    deadUrl = 'http://localhost:' + str(unusedPort)
    rpcClient = hodXRClient(deadUrl, retryRequests=False)
    self.assertRaises(Exception, rpcClient.testing)
def testInterrupt(self):
    """ HOD should raise HodInterruptException when interrupted

    The RPC is issued from a worker thread after the global interrupt flag
    has been set.  An assertion failure raised inside the worker would be
    lost together with the thread, so it is captured there and reported as
    a failure of this test once the thread has been joined.
    """
    import sys  # local import: only needed to capture the worker's error

    workerErrors = []

    def interrupt(testClass):
        try:
            testClass.assertRaises(HodInterruptException, client.testing)
        except Exception:
            # Also catches the AssertionError assertRaises raises when the
            # expected exception does not occur.
            workerErrors.append(sys.exc_info()[1])

    serverPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                               high=50000)
    client = hodXRClient('http://localhost:' + str(serverPort))
    myThread = threading.Thread(name='testinterrupt', target=interrupt,
                                args=(self,))
    # Set the global interrupt
    hodInterrupt.setFlag()
    myThread.start()
    myThread.join()

    # Surface the worker's outcome in the thread that runs the test.
    if workerErrors:
        self.fail("worker thread did not see HodInterruptException: %r"
                  % (workerErrors[0],))
def testInterrupt(self):
    """ HOD should raise HodInterruptException when interrupted

    The RPC is issued from a worker thread after the global interrupt flag
    has been set.  An assertion failure raised inside the worker would be
    lost together with the thread, so it is captured there and reported as
    a failure of this test once the thread has been joined.
    """
    import sys  # local import: only needed to capture the worker's error

    workerErrors = []

    def interrupt(testClass):
        try:
            testClass.assertRaises(HodInterruptException, client.testing)
        except Exception:
            # Also catches the AssertionError assertRaises raises when the
            # expected exception does not occur.
            workerErrors.append(sys.exc_info()[1])

    serverPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                               high=50000)
    client = hodXRClient('http://localhost:' + str(serverPort))
    myThread = threading.Thread(name='testinterrupt', target=interrupt,
                                args=(self,))
    # Set the global interrupt
    hodInterrupt.setFlag()
    myThread.start()
    myThread.join()

    # Surface the worker's outcome in the thread that runs the test.
    if workerErrors:
        self.fail("worker thread did not see HodInterruptException: %r"
                  % (workerErrors[0],))
def _register_service(self, port=None, installSignalHandlers=1): if self.__svcrgy: self.logs['main'].info( "Registering service with service registery %s... " % self.__svcrgy) svcrgy = hodXRClient(self.__svcrgy, None, None, 0, 0, installSignalHandlers) if self._xrc and self._http: svcrgy.registerService( self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', { 'xrs': "http://%s:%s" % (self._xrc.server_address[0], self._xrc.server_address[1]), 'http': "http://%s:%s" % (self._http.server_address[0], self._http.server_address[1]) }) elif self._xrc: svcrgy.registerService( self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', { 'xrs': "http://%s:%s" % (self._xrc.server_address[0], self._xrc.server_address[1]), }) elif self._http: svcrgy.registerService( self._cfg['userid'], self._serviceID, self.hostname, self.name, 'hod', { 'http': "http://%s:%s" % (self._http.server_address[0], self._http.server_address[1]), }) else: svcrgy.registerService(self._cfg['userid'], self._serviceID, self.hostname, name, 'hod', {})
def __init__(self, cfg, log, **kwds):
    """starts nodepool and services

    cfg  -- configuration mapping.  The 'ringmaster', 'servicedesc',
            'nodepooldesc' and 'hodring' sections are read here (plus
            'gridservice-hdfs' / 'gridservice-mapred' for external
            services); several 'hodring' entries are written at the end.
    log  -- logger used for the debug/critical messages below.
    kwds -- accepted but not used in this method.

    Raises Exception when a tarball is configured but cannot be found after
    copying, or when the Hadoop version cannot be determined.  Any error
    while building the service descriptors is logged and re-raised.
    """
    self.download = False
    self.httpServer = None
    self.cfg = cfg
    self.log = log
    self.__hostname = local_fqdn()
    self.workDirs = None

    # ref to the idle job tracker object.
    self.__jtMonitor = None
    self.__idlenessDetected = False
    self.__stopInProgress = False
    self.__isStopped = False  # to let main exit
    self.__exitCode = 0  # exit code with which the ringmaster main method should return

    self.workers_per_ring = self.cfg["ringmaster"]["workers_per_ring"]

    self.__initialize_signal_handlers()

    # Picks the first entry of 'servicedesc'; gsvc is not referenced again
    # in this method.
    sdd = self.cfg["servicedesc"]
    gsvc = None
    for key in sdd:
        gsvc = sdd[key]
        break

    npd = self.cfg["nodepooldesc"]
    self.np = NodePoolUtil.getNodePool(npd, cfg, log)

    self.log.debug("Getting service ID.")
    self.serviceId = self.np.getServiceId()
    self.log.debug("Got service ID: %s" % self.serviceId)

    # A configured 'hadoop-tar-ball' switches on download mode.
    self.tarSrcLoc = None
    if self.cfg["ringmaster"].has_key("hadoop-tar-ball"):
        self.download = True
        self.tarSrcLoc = self.cfg["ringmaster"]["hadoop-tar-ball"]

    self.cd_to_tempdir()

    if self.download:
        # Copy the tarball into the current working directory and remember
        # the name it was found under.
        self.__copy_tarball(os.getcwd())
        self.basename = self.__find_tarball_in_dir(os.getcwd())
        if self.basename is None:
            raise Exception("Did not find tarball copied from %s in %s." % (self.tarSrcLoc, os.getcwd()))

    self.serviceAddr = to_http_url(self.cfg["ringmaster"]["svcrgy-addr"])
    self.log.debug("Service registry @ %s" % self.serviceAddr)

    self.serviceClient = hodXRClient(self.serviceAddr)
    # Maps service name (getName()) -> hdfs / mapred service object.
    self.serviceDict = {}
    try:
        sdl = self.cfg["servicedesc"]

        workDirs = self.getWorkDirs(cfg)

        hdfsDesc = sdl["hdfs"]
        hdfs = None

        # Determine hadoop Version
        hadoopVers = hadoopVersion(self.__getHadoopDir(), self.cfg["hodring"]["java-home"], self.log)

        if (hadoopVers["major"] == None) or (hadoopVers["minor"] == None):
            raise Exception(
                "Could not retrive the version of Hadoop."
                + " Check the Hadoop installation or the value of the hodring.java-home variable."
            )
        # External services take their master params from the config;
        # internal ones are created with a fixed index (0 for hdfs, 1 for
        # mapred) and the configured workers_per_ring.
        if hdfsDesc.isExternal():
            hdfs = HdfsExternal(hdfsDesc, workDirs, version=int(hadoopVers["minor"]))
            hdfs.setMasterParams(self.cfg["gridservice-hdfs"])
        else:
            hdfs = Hdfs(
                hdfsDesc, workDirs, 0, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring
            )

        self.serviceDict[hdfs.getName()] = hdfs

        mrDesc = sdl["mapred"]
        mr = None
        if mrDesc.isExternal():
            mr = MapReduceExternal(mrDesc, workDirs, version=int(hadoopVers["minor"]))
            mr.setMasterParams(self.cfg["gridservice-mapred"])
        else:
            mr = MapReduce(
                mrDesc, workDirs, 1, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring
            )

        self.serviceDict[mr.getName()] = mr
    except:
        # Log the failure, then let the original exception propagate.
        self.log.critical(
            "Exception in creating Hdfs and Map/Reduce descriptor objects: \ %s." % get_exception_error_string()
        )
        self.log.debug(get_exception_string())
        raise

    # should not be starting these in a constructor
    ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)

    self.rpcserver = ringMasterServer.getAddress()

    self.httpAddress = None
    self.tarAddress = None
    hostname = socket.gethostname()
    if self.download:
        self.httpServer = threadedHTTPServer(hostname, self.cfg["ringmaster"]["http-port-range"])

        # NOTE(review): execution continues past serve_forever(), so the
        # server presumably runs in a background thread -- confirm.
        self.httpServer.serve_forever()
        self.httpAddress = "http://%s:%d/" % (self.httpServer.server_address[0], self.httpServer.server_address[1])
        self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

        ringMasterServer.instance.logMasterSources.registerTarSource(hostname, self.tarAddress)
    else:
        self.log.debug("Download not set.")

    self.log.debug(
        "%s %s %s %s %s"
        % (self.cfg["ringmaster"]["userid"], self.serviceId, self.__hostname, "ringmaster", "hod")
    )

    # Register with the service registry only when configured to; the
    # 'http' URL is included only if the HTTP server was started above.
    if self.cfg["ringmaster"]["register"]:
        if self.httpAddress:
            self.serviceClient.registerService(
                self.cfg["ringmaster"]["userid"],
                self.serviceId,
                self.__hostname,
                "ringmaster",
                "hod",
                {"xrs": self.rpcserver, "http": self.httpAddress},
            )
        else:
            self.serviceClient.registerService(
                self.cfg["ringmaster"]["userid"],
                self.serviceId,
                self.__hostname,
                "ringmaster",
                "hod",
                {"xrs": self.rpcserver},
            )

    self.log.debug("Registered with serivce registry: %s." % self.serviceAddr)

    # hodRingPath is computed but not used later in this method.
    hodRingPath = os.path.join(cfg["ringmaster"]["base-dir"], "bin", "hodring")
    hodRingWorkDir = os.path.join(cfg["hodring"]["temp-dir"], "hodring" + "_" + getpass.getuser())

    # Write the values derived above into the 'hodring' config section.
    self.cfg["hodring"]["hodring"] = [hodRingWorkDir]
    self.cfg["hodring"]["svcrgy-addr"] = self.cfg["ringmaster"]["svcrgy-addr"]
    self.cfg["hodring"]["service-id"] = self.np.getServiceId()

    self.cfg["hodring"]["ringmaster-xrs-addr"] = self.__url_to_addr(self.rpcserver)

    if self.tarSrcLoc != None:
        cfg["hodring"]["download-addr"] = self.tarAddress

    self.__init_job_tracker_monitor(ringMasterServer.instance.logMasterSources)
def start(self): """Run and maintain hodring commands""" try: if self._cfg.has_key('download-addr'): self._http = threadedHTTPServer('', self._cfg['http-port-range']) self.log.info("Starting http server...") self._http.serve_forever() self.log.debug("http://%s:%d" % (self._http.server_address[0], self._http.server_address[1])) hodBaseService.start(self) ringXRAddress = None if self._cfg.has_key('ringmaster-xrs-addr'): ringXRAddress = "http://%s:%s/" % (self._cfg['ringmaster-xrs-addr'][0], self._cfg['ringmaster-xrs-addr'][1]) self.log.debug("Ringmaster at %s" % ringXRAddress) self.log.debug("Creating service registry XML-RPC client.") serviceClient = hodXRClient(to_http_url( self._cfg['svcrgy-addr'])) if ringXRAddress == None: self.log.info("Did not get ringmaster XML-RPC address. Fetching information from service registry.") ringList = serviceClient.getServiceInfo(self._cfg['userid'], self._cfg['service-id'], 'ringmaster', 'hod') self.log.debug(pprint.pformat(ringList)) if len(ringList): if isinstance(ringList, list): ringXRAddress = ringList[0]['xrs'] count = 0 while (ringXRAddress == None and count < 3000): ringList = serviceClient.getServiceInfo(self._cfg['userid'], self._cfg['service-id'], 'ringmaster', 'hod') if len(ringList): if isinstance(ringList, list): ringXRAddress = ringList[0]['xrs'] count = count + 1 time.sleep(.2) if ringXRAddress == None: raise Exception("Could not get ringmaster XML-RPC server address.") self.log.debug("Creating ringmaster XML-RPC client.") ringClient = hodXRClient(ringXRAddress) id = self.hostname + "_" + str(os.getpid()) if 'download-addr' in self._cfg: self.__download_package(ringClient) else: self.log.debug("Did not find a download address.") cmdlist = [] firstTime = True increment = 0 hadoopStartupTime = 2 cmdlist = ringClient.getCommand(id) while (cmdlist == []): if firstTime: sleepTime = increment + self._cfg['cmd-retry-initial-time'] + hadoopStartupTime\ + random.uniform(0,self._cfg['cmd-retry-interval']) firstTime = False 
else: sleepTime = increment + self._cfg['cmd-retry-initial-time'] + \ + random.uniform(0,self._cfg['cmd-retry-interval']) self.log.debug("Did not get command list. Waiting for %s seconds." % (sleepTime)) time.sleep(sleepTime) increment = increment + 1 cmdlist = ringClient.getCommand(id) self.log.debug(pformat(cmdlist)) cmdDescs = [] for cmds in cmdlist: cmdDescs.append(CommandDesc(cmds['dict'], self.log)) self._cfg['commanddesc'] = cmdDescs self.log.info("Running hadoop commands...") self.__run_hadoop_commands(False) masterParams = [] for k, cmd in self.__running.iteritems(): masterParams.extend(cmd.filledInKeyVals) self.log.debug("printing getparams") self.log.debug(pformat(id)) self.log.debug(pformat(masterParams)) # when this is on a required host, the ringMaster already has our masterParams if(len(masterParams) > 0): ringClient.addMasterParams(id, masterParams) except Exception, e: raise Exception(e)
def allocate(self, clusterDir, min, max=None): status = 0 failureCount = 0 self.__svcrgyClient = self.__get_svcrgy_client() self.__log.debug("allocate %s %s %s" % (clusterDir, min, max)) if min < 3: self.__log.critical("Minimum nodes must be greater than 2.") status = 2 else: nodeSet = self.__nodePool.newNodeSet(min) walltime = None if self.__cfg['hod'].has_key('walltime'): walltime = self.__cfg['hod']['walltime'] self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime) # if the job submission returned an error other than no resources # retry a couple of times while (self.jobId is False) and (exitCode != 188): if hodInterrupt.isSet(): raise HodInterruptException() failureCount += 1 if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']): self.__log.debug("failed submitting job more than the retries. exiting") break else: # wait a bit before retrying time.sleep(self.__cfg['hod']['job-command-failure-interval']) if hodInterrupt.isSet(): raise HodInterruptException() self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime) if self.jobId: jobStatus = None try: jobStatus = self.__check_job_status() except HodInterruptException, h: self.__log.info(HOD_INTERRUPTED_MESG) self.delete_job(self.jobId) self.__log.info("Cluster %s removed from queue." 
% self.jobId) raise h else: if jobStatus == -1: self.delete_job(self.jobId); status = 4 return status if jobStatus: self.__log.info("Cluster Id %s" \ % self.jobId) try: self.ringmasterXRS = self.__get_ringmaster_client() self.__log.debug("Ringmaster at : %s" % self.ringmasterXRS ) ringClient = None if self.ringmasterXRS: ringClient = hodXRClient(self.ringmasterXRS) hdfsStatus, hdfsAddr, self.hdfsInfo = \ self.__init_hadoop_service('hdfs', ringClient) if hdfsStatus: self.__log.info("HDFS UI at http://%s" % self.hdfsInfo) mapredStatus, mapredAddr, self.mapredInfo = \ self.__init_hadoop_service('mapred', ringClient) if mapredStatus: self.__log.info("Mapred UI at http://%s" % self.mapredInfo) if self.__cfg['hod'].has_key('update-worker-info') \ and self.__cfg['hod']['update-worker-info']: workerInfoMap = {} workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo # Ringmaster URL sample format : http://hostname:port/ workerInfoMap['RM RPC Port'] = '%s' % self.ringmasterXRS.split(":")[2].strip("/") if mapredAddr.find(':') != -1: workerInfoMap['Mapred RPC Port'] = mapredAddr.split(':')[1] ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId) if ret != 0: self.__log.warn('Could not update HDFS and Mapred information.' \ 'User Portal may not show relevant information.' 
\ 'Error code=%s' % ret) self.__cfg.replace_escape_seqs() # Go generate the client side hadoop-site.xml now # adding final-params as well, just so that conf on # client-side and server-side are (almost) the same clientParams = None serverParams = {} finalServerParams = {} # client-params if self.__cfg['hod'].has_key('client-params'): clientParams = self.__cfg['hod']['client-params'] # server-params if self.__cfg['gridservice-mapred'].has_key('server-params'): serverParams.update(\ self.__cfg['gridservice-mapred']['server-params']) if self.__cfg['gridservice-hdfs'].has_key('server-params'): # note that if there are params in both mapred and hdfs # sections, the ones in hdfs overwirte the ones in mapred serverParams.update(\ self.__cfg['gridservice-hdfs']['server-params']) # final-server-params if self.__cfg['gridservice-mapred'].has_key(\ 'final-server-params'): finalServerParams.update(\ self.__cfg['gridservice-mapred']['final-server-params']) if self.__cfg['gridservice-hdfs'].has_key( 'final-server-params'): finalServerParams.update(\ self.__cfg['gridservice-hdfs']['final-server-params']) clusterFactor = self.__cfg['hod']['cluster-factor'] tempDir = self.__cfg['hod']['temp-dir'] if not os.path.exists(tempDir): os.makedirs(tempDir) tempDir = os.path.join( tempDir, self.__cfg['hod']['userid']\ + "." + self.jobId ) mrSysDir = getMapredSystemDirectory(self.__cfg['hodring']['mapred-system-dir-root'],\ self.__cfg['hod']['userid'], self.jobId) self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\ hdfsAddr, mrSysDir, mapredAddr, clientParams,\ serverParams, finalServerParams,\ clusterFactor) self.__log.info("hadoop-site.xml at %s" % clusterDir) # end of hadoop-site.xml generation else: status = 8 else: status = 7 else: status = 6 if status != 0: self.__log.debug("Cleaning up cluster id %s, as cluster could not be allocated." 
% self.jobId) if ringClient is None: self.delete_job(self.jobId) else: self.__log.debug("Calling rm.stop()") ringClient.stopRM() self.__log.debug("Returning from rm.stop()") except HodInterruptException, h: self.__log.info(HOD_INTERRUPTED_MESG) if self.ringmasterXRS: if ringClient is None: ringClient = hodXRClient(self.ringmasterXRS) self.__log.debug("Calling rm.stop()") ringClient.stopRM() self.__log.debug("Returning from rm.stop()") self.__log.info("Cluster Shutdown by informing ringmaster.") else: self.delete_job(self.jobId) self.__log.info("Cluster %s removed from queue directly." % self.jobId) raise h else:
self.__log.info("Exception in collecting Job tracker logs. Ignoring.") rmAddr = None if clusterInfo.has_key('ring'): # format is http://host:port/ We need host:port rmAddr = clusterInfo['ring'][7:] if rmAddr.endswith('/'): rmAddr = rmAddr[:-1] if (rmAddr is None) or (not self.__isRingMasterAlive(rmAddr)): # Cluster is already dead, don't try to contact ringmaster. self.__nodePool.finalize() status = 10 # As cluster is dead, we just set the status to 'cluster dead'. else: xrsAddr = clusterInfo['ring'] rmClient = hodXRClient(xrsAddr) self.__log.debug('calling rm.stop') rmClient.stopRM() self.__log.debug('completed rm.stop') # cleanup hod temp dirs tempDir = os.path.join( self.__cfg['hod']['temp-dir'], \ self.__cfg['hod']['userid'] + "." + clusterInfo['jobid'] ) if os.path.exists(tempDir): shutil.rmtree(tempDir) return status class hadoopScript: def __init__(self, conf, execDir): self.__environ = os.environ.copy()
def clusterStart(self, initialize=True):
    """Start a stopped mapreduce/dfs cluster

    Looks the ringmaster up in the service registry, fetches a command list
    from it (getCommand when `initialize` is true, getAdminCommand
    otherwise), stores the resulting CommandDesc objects under
    self._cfg['commanddesc'] and runs them.  With `initialize` true the
    filled-in key/values of the running commands are also sent back to the
    ringmaster.

    Any exception is logged rather than propagated; always returns True.
    """
    if initialize:
        invokedMsg = 'clusterStart Method Invoked - Initialize'
    else:
        invokedMsg = 'clusterStart Method Invoked - No Initialize'
    self.log.debug(invokedMsg)

    try:
        self.log.debug("Creating service registry XML-RPC client.")
        registryUrl = to_http_url(self._cfg['svcrgy-addr'])
        serviceClient = hodXRClient(registryUrl, None, None, 0, 0, 0)

        self.log.info("Fetching ringmaster information from service registry.")
        ringXRAddress = None
        # NOTE(review): up to 3000 lookups are issued back-to-back, with no
        # pause between attempts.
        attempts = 0
        while ringXRAddress is None and attempts < 3000:
            ringList = serviceClient.getServiceInfo(
                self._cfg['userid'], self._cfg['service-id'],
                'ringmaster', 'hod')
            if len(ringList) and isinstance(ringList, list):
                ringXRAddress = ringList[0]['xrs']
            attempts += 1

        if ringXRAddress is None:
            raise Exception("Could not get ringmaster XML-RPC server address.")

        self.log.debug("Creating ringmaster XML-RPC client.")
        ringClient = hodXRClient(ringXRAddress, None, None, 0, 0, 0)

        # Identify this hodring to the ringmaster as "<hostname>_<pid>".
        ringId = self.hostname + "_" + str(os.getpid())

        # NOTE(review): both loops below spin without delay until the
        # ringmaster returns a non-empty list.
        cmdlist = []
        if initialize:
            if 'download-addr' in self._cfg:
                self.__download_package(ringClient)
            else:
                self.log.debug("Did not find a download address.")
            while cmdlist == []:
                cmdlist = ringClient.getCommand(ringId)
        else:
            while cmdlist == []:
                cmdlist = ringClient.getAdminCommand(ringId)

        self.log.debug(pformat(cmdlist))
        self._cfg['commanddesc'] = [
            CommandDesc(cmds['dict'], self.log) for cmds in cmdlist]

        if not initialize:
            self.log.info("Running hadoop commands again... - No Initialize")
            self.__run_hadoop_commands()
        else:
            self.log.info("Running hadoop commands again... - Initialize")
            self.__run_hadoop_commands()

            masterParams = []
            for _key, cmd in self.__running.iteritems():
                self.log.debug(cmd)
                masterParams.extend(cmd.filledInKeyVals)

            self.log.debug("printing getparams")
            self.log.debug(pformat(ringId))
            self.log.debug(pformat(masterParams))
            # when this is on a required host, the ringMaster already has our masterParams
            if masterParams:
                ringClient.addMasterParams(ringId, masterParams)
    except:
        self.log.error(get_exception_string())

    return True
def start(self): """Run and maintain hodring commands""" try: if self._cfg.has_key('download-addr'): self._http = threadedHTTPServer('', self._cfg['http-port-range']) self.log.info("Starting http server...") self._http.serve_forever() self.log.debug("http://%s:%d" % (self._http.server_address[0], self._http.server_address[1])) hodBaseService.start(self) ringXRAddress = None if self._cfg.has_key('ringmaster-xrs-addr'): ringXRAddress = "http://%s:%s/" % ( self._cfg['ringmaster-xrs-addr'][0], self._cfg['ringmaster-xrs-addr'][1]) self.log.debug("Ringmaster at %s" % ringXRAddress) self.log.debug("Creating service registry XML-RPC client.") serviceClient = hodXRClient(to_http_url(self._cfg['svcrgy-addr'])) if ringXRAddress == None: self.log.info( "Did not get ringmaster XML-RPC address. Fetching information from service registry." ) ringList = serviceClient.getServiceInfo( self._cfg['userid'], self._cfg['service-id'], 'ringmaster', 'hod') self.log.debug(pprint.pformat(ringList)) if len(ringList): if isinstance(ringList, list): ringXRAddress = ringList[0]['xrs'] count = 0 while (ringXRAddress == None and count < 3000): ringList = serviceClient.getServiceInfo( self._cfg['userid'], self._cfg['service-id'], 'ringmaster', 'hod') if len(ringList): if isinstance(ringList, list): ringXRAddress = ringList[0]['xrs'] count = count + 1 time.sleep(.2) if ringXRAddress == None: raise Exception( "Could not get ringmaster XML-RPC server address.") self.log.debug("Creating ringmaster XML-RPC client.") ringClient = hodXRClient(ringXRAddress) id = self.hostname + "_" + str(os.getpid()) if 'download-addr' in self._cfg: self.__download_package(ringClient) else: self.log.debug("Did not find a download address.") cmdlist = [] firstTime = True increment = 0 hadoopStartupTime = 2 cmdlist = ringClient.getCommand(id) while (cmdlist == []): if firstTime: sleepTime = increment + self._cfg['cmd-retry-initial-time'] + hadoopStartupTime\ + random.uniform(0,self._cfg['cmd-retry-interval']) firstTime = 
False else: sleepTime = increment + self._cfg['cmd-retry-initial-time'] + \ + random.uniform(0,self._cfg['cmd-retry-interval']) self.log.debug( "Did not get command list. Waiting for %s seconds." % (sleepTime)) time.sleep(sleepTime) increment = increment + 1 cmdlist = ringClient.getCommand(id) self.log.debug(pformat(cmdlist)) cmdDescs = [] for cmds in cmdlist: cmdDescs.append(CommandDesc(cmds['dict'], self.log)) self._cfg['commanddesc'] = cmdDescs self.log.info("Running hadoop commands...") self.__run_hadoop_commands(False) masterParams = [] for k, cmd in self.__running.iteritems(): masterParams.extend(cmd.filledInKeyVals) self.log.debug("printing getparams") self.log.debug(pformat(id)) self.log.debug(pformat(masterParams)) # when this is on a required host, the ringMaster already has our masterParams if (len(masterParams) > 0): ringClient.addMasterParams(id, masterParams) except Exception, e: raise Exception(e)
def __get_svcrgy_client(self):
    """Return a hodXRClient for the address configured as hod.xrs-address."""
    # The configured address is run through to_http_url before use.
    return hodXRClient(to_http_url(self.__cfg['hod']['xrs-address']))
def allocate(self, clusterDir, min, max=None): status = 0 failureCount = 0 self.__svcrgyClient = self.__get_svcrgy_client() self.__log.debug("allocate %s %s %s" % (clusterDir, min, max)) if min < 3: self.__log.critical("Minimum nodes must be greater than 2.") status = 2 else: nodeSet = self.__nodePool.newNodeSet(min) walltime = None if self.__cfg['hod'].has_key('walltime'): walltime = self.__cfg['hod']['walltime'] self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime) # if the job submission returned an error other than no resources # retry a couple of times while (self.jobId is False) and (exitCode != 188): if hodInterrupt.isSet(): raise HodInterruptException() failureCount += 1 if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']): self.__log.debug("failed submitting job more than the retries. exiting") break else: # wait a bit before retrying time.sleep(self.__cfg['hod']['job-command-failure-interval']) if hodInterrupt.isSet(): raise HodInterruptException() self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime) if self.jobId: jobStatus = None try: jobStatus = self.__check_job_status() except HodInterruptException, h: self.__log.info(HOD_INTERRUPTED_MESG) self.delete_job(self.jobId) self.__log.info("Cluster %s removed from queue." 
% self.jobId) raise h else: if jobStatus == -1: self.delete_job(self.jobId); status = 4 return status if jobStatus: self.__log.info("Cluster Id %s" \ % self.jobId) try: self.ringmasterXRS = self.__get_ringmaster_client() self.__log.debug("Ringmaster at : %s" % self.ringmasterXRS ) ringClient = None if self.ringmasterXRS: ringClient = hodXRClient(self.ringmasterXRS) hdfsStatus, hdfsAddr, self.hdfsInfo = \ self.__init_hadoop_service('hdfs', ringClient) if hdfsStatus: self.__log.info("HDFS UI at http://%s" % self.hdfsInfo) mapredStatus, mapredAddr, self.mapredInfo = \ self.__init_hadoop_service('mapred', ringClient) if mapredStatus: self.__log.info("Mapred UI at http://%s" % self.mapredInfo) if self.__cfg['hod'].has_key('update-worker-info') \ and self.__cfg['hod']['update-worker-info']: workerInfoMap = {} workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo if mapredAddr.find(':') != -1: workerInfoMap['Mapred RPC Port'] = mapredAddr.split(':')[1] ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId) if ret != 0: self.__log.warn('Could not update HDFS and Mapred information.' \ 'User Portal may not show relevant information.' 
\ 'Error code=%s' % ret) self.__cfg.replace_escape_seqs() # Go generate the client side hadoop-site.xml now # adding final-params as well, just so that conf on # client-side and server-side are (almost) the same clientParams = None serverParams = {} finalServerParams = {} # client-params if self.__cfg['hod'].has_key('client-params'): clientParams = self.__cfg['hod']['client-params'] # server-params if self.__cfg['gridservice-mapred'].has_key('server-params'): serverParams.update(\ self.__cfg['gridservice-mapred']['server-params']) if self.__cfg['gridservice-hdfs'].has_key('server-params'): # note that if there are params in both mapred and hdfs # sections, the ones in hdfs overwirte the ones in mapred serverParams.update(\ self.__cfg['gridservice-hdfs']['server-params']) # final-server-params if self.__cfg['gridservice-mapred'].has_key(\ 'final-server-params'): finalServerParams.update(\ self.__cfg['gridservice-mapred']['final-server-params']) if self.__cfg['gridservice-hdfs'].has_key( 'final-server-params'): finalServerParams.update(\ self.__cfg['gridservice-hdfs']['final-server-params']) clusterFactor = self.__cfg['hod']['cluster-factor'] tempDir = self.__cfg['hod']['temp-dir'] if not os.path.exists(tempDir): os.makedirs(tempDir) tempDir = os.path.join( tempDir, self.__cfg['hod']['userid']\ + "." + self.jobId ) mrSysDir = getMapredSystemDirectory(self.__cfg['hodring']['mapred-system-dir-root'],\ self.__cfg['hod']['userid'], self.jobId) self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\ hdfsAddr, mrSysDir, mapredAddr, clientParams,\ serverParams, finalServerParams,\ clusterFactor) self.__log.info("hadoop-site.xml at %s" % clusterDir) # end of hadoop-site.xml generation else: status = 8 else: status = 7 else: status = 6 if status != 0: self.__log.debug("Cleaning up cluster id %s, as cluster could not be allocated." 
% self.jobId) if ringClient is None: self.delete_job(self.jobId) else: self.__log.debug("Calling rm.stop()") ringClient.stopRM() self.__log.debug("Returning from rm.stop()") except HodInterruptException, h: self.__log.info(HOD_INTERRUPTED_MESG) if self.ringmasterXRS: if ringClient is None: ringClient = hodXRClient(self.ringmasterXRS) self.__log.debug("Calling rm.stop()") ringClient.stopRM() self.__log.debug("Returning from rm.stop()") self.__log.info("Cluster Shutdown by informing ringmaster.") else: self.delete_job(self.jobId) self.__log.info("Cluster %s removed from queue directly." % self.jobId) raise h else:
def __init__(self, cfg, log, **kwds):
    """starts nodepool and services

    cfg  -- configuration mapping.  The 'ringmaster', 'servicedesc',
            'nodepooldesc' and 'hodring' sections are read here (plus
            'gridservice-hdfs' / 'gridservice-mapred' for external
            services); several 'hodring' entries are written at the end.
    log  -- logger used for the debug/critical messages below.
    kwds -- accepted but not used in this method.

    Raises Exception when a tarball is configured but cannot be found after
    copying, or when the Hadoop version cannot be determined.  Any error
    while building the service descriptors is logged and re-raised.
    """
    self.download = False
    self.httpServer = None
    self.cfg = cfg
    self.log = log
    self.__hostname = local_fqdn()
    self.workDirs = None

    # ref to the idle job tracker object.
    self.__jtMonitor = None
    self.__idlenessDetected = False
    self.__stopInProgress = False
    self.__isStopped = False  # to let main exit
    self.__exitCode = 0  # exit code with which the ringmaster main method should return

    self.workers_per_ring = self.cfg['ringmaster']['workers_per_ring']

    self.__initialize_signal_handlers()

    # Picks the first entry of 'servicedesc'; gsvc is not referenced again
    # in this method.
    sdd = self.cfg['servicedesc']
    gsvc = None
    for key in sdd:
        gsvc = sdd[key]
        break

    npd = self.cfg['nodepooldesc']
    self.np = NodePoolUtil.getNodePool(npd, cfg, log)

    self.log.debug("Getting service ID.")
    self.serviceId = self.np.getServiceId()
    self.log.debug("Got service ID: %s" % self.serviceId)

    # A configured 'hadoop-tar-ball' switches on download mode.
    self.tarSrcLoc = None
    if self.cfg['ringmaster'].has_key('hadoop-tar-ball'):
        self.download = True
        self.tarSrcLoc = self.cfg['ringmaster']['hadoop-tar-ball']

    self.cd_to_tempdir()

    if (self.download):
        # Copy the tarball into the current working directory and remember
        # the name it was found under.
        self.__copy_tarball(os.getcwd())
        self.basename = self.__find_tarball_in_dir(os.getcwd())
        if self.basename is None:
            raise Exception('Did not find tarball copied from %s in %s.' %
                            (self.tarSrcLoc, os.getcwd()))

    self.serviceAddr = to_http_url(self.cfg['ringmaster']['svcrgy-addr'])
    self.log.debug("Service registry @ %s" % self.serviceAddr)

    self.serviceClient = hodXRClient(self.serviceAddr)
    # Maps service name (getName()) -> hdfs / mapred service object.
    self.serviceDict = {}
    try:
        sdl = self.cfg['servicedesc']

        workDirs = self.getWorkDirs(cfg)

        hdfsDesc = sdl['hdfs']
        hdfs = None

        # Determine hadoop Version
        hadoopVers = hadoopVersion(self.__getHadoopDir(), \
                                   self.cfg['hodring']['java-home'], self.log)

        if (hadoopVers['major'] == None) or (hadoopVers['minor'] == None):
            raise Exception(
                'Could not retrive the version of Hadoop.' +
                ' Check the Hadoop installation or the value of the hodring.java-home variable.'
            )
        # External services take their master params from the config;
        # internal ones are created with a fixed index (0 for hdfs, 1 for
        # mapred) and the configured workers_per_ring.
        if hdfsDesc.isExternal():
            hdfs = HdfsExternal(hdfsDesc,
                                workDirs,
                                version=int(hadoopVers['minor']))
            hdfs.setMasterParams(self.cfg['gridservice-hdfs'])
        else:
            hdfs = Hdfs(hdfsDesc,
                        workDirs,
                        0,
                        version=int(hadoopVers['minor']),
                        workers_per_ring=self.workers_per_ring)

        self.serviceDict[hdfs.getName()] = hdfs

        mrDesc = sdl['mapred']
        mr = None
        if mrDesc.isExternal():
            mr = MapReduceExternal(mrDesc,
                                   workDirs,
                                   version=int(hadoopVers['minor']))
            mr.setMasterParams(self.cfg['gridservice-mapred'])
        else:
            mr = MapReduce(mrDesc,
                           workDirs,
                           1,
                           version=int(hadoopVers['minor']),
                           workers_per_ring=self.workers_per_ring)

        self.serviceDict[mr.getName()] = mr
    except:
        # Log the failure, then let the original exception propagate.
        self.log.critical(
            "Exception in creating Hdfs and Map/Reduce descriptor objects: \ %s." % get_exception_error_string())
        self.log.debug(get_exception_string())
        raise

    # should not be starting these in a constructor
    ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)

    self.rpcserver = ringMasterServer.getAddress()

    self.httpAddress = None
    self.tarAddress = None
    hostname = socket.gethostname()
    if (self.download):
        self.httpServer = threadedHTTPServer(
            hostname, self.cfg['ringmaster']['http-port-range'])

        # NOTE(review): execution continues past serve_forever(), so the
        # server presumably runs in a background thread -- confirm.
        self.httpServer.serve_forever()
        self.httpAddress = "http://%s:%d/" % (
            self.httpServer.server_address[0],
            self.httpServer.server_address[1])
        self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

        ringMasterServer.instance.logMasterSources.registerTarSource(
            hostname, self.tarAddress)
    else:
        self.log.debug("Download not set.")

    self.log.debug("%s %s %s %s %s" %
                   (self.cfg['ringmaster']['userid'], self.serviceId,
                    self.__hostname, 'ringmaster', 'hod'))

    # Register with the service registry only when configured to; the
    # 'http' URL is included only if the HTTP server was started above.
    if self.cfg['ringmaster']['register']:
        if self.httpAddress:
            self.serviceClient.registerService(
                self.cfg['ringmaster']['userid'], self.serviceId,
                self.__hostname, 'ringmaster', 'hod', {
                    'xrs': self.rpcserver,
                    'http': self.httpAddress
                })
        else:
            self.serviceClient.registerService(
                self.cfg['ringmaster']['userid'], self.serviceId,
                self.__hostname, 'ringmaster', 'hod', {
                    'xrs': self.rpcserver,
                })

    self.log.debug("Registered with serivce registry: %s." %
                   self.serviceAddr)

    # hodRingPath is computed but not used later in this method.
    hodRingPath = os.path.join(cfg['ringmaster']['base-dir'], 'bin',
                               'hodring')
    hodRingWorkDir = os.path.join(cfg['hodring']['temp-dir'],
                                  'hodring' + '_' + getpass.getuser())

    # Write the values derived above into the 'hodring' config section.
    self.cfg['hodring']['hodring'] = [
        hodRingWorkDir,
    ]
    self.cfg['hodring']['svcrgy-addr'] = self.cfg['ringmaster'][
        'svcrgy-addr']
    self.cfg['hodring']['service-id'] = self.np.getServiceId()

    self.cfg['hodring']['ringmaster-xrs-addr'] = self.__url_to_addr(
        self.rpcserver)

    if (self.tarSrcLoc != None):
        cfg['hodring']['download-addr'] = self.tarAddress

    self.__init_job_tracker_monitor(
        ringMasterServer.instance.logMasterSources)
def testSuccess(self):
    """A call to the `testing` RPC on the server at serverPort returns True."""
    global serverPort
    serverUrl = 'http://localhost:' + str(serverPort)
    rpcClient = hodXRClient(serverUrl, retryRequests=False)
    rpcResult = rpcClient.testing()
    self.assertEqual(rpcResult, True)
def clusterStart(self, initialize=True):
    """Start a stopped mapreduce/dfs cluster

    Looks the ringmaster up in the service registry, fetches a command list
    from it (getCommand when `initialize` is true, getAdminCommand
    otherwise), stores the resulting CommandDesc objects under
    self._cfg['commanddesc'] and runs them.  With `initialize` true the
    filled-in key/values of the running commands are also sent back to the
    ringmaster.

    Any exception is logged rather than propagated; always returns True.
    """
    if initialize:
        invokedMsg = 'clusterStart Method Invoked - Initialize'
    else:
        invokedMsg = 'clusterStart Method Invoked - No Initialize'
    self.log.debug(invokedMsg)

    try:
        self.log.debug("Creating service registry XML-RPC client.")
        registryUrl = to_http_url(self._cfg['svcrgy-addr'])
        serviceClient = hodXRClient(registryUrl, None, None, 0, 0, 0)

        self.log.info("Fetching ringmaster information from service registry.")
        ringXRAddress = None
        # NOTE(review): up to 3000 lookups are issued back-to-back, with no
        # pause between attempts.
        attempts = 0
        while ringXRAddress is None and attempts < 3000:
            ringList = serviceClient.getServiceInfo(
                self._cfg['userid'], self._cfg['service-id'],
                'ringmaster', 'hod')
            if len(ringList) and isinstance(ringList, list):
                ringXRAddress = ringList[0]['xrs']
            attempts += 1

        if ringXRAddress is None:
            raise Exception("Could not get ringmaster XML-RPC server address.")

        self.log.debug("Creating ringmaster XML-RPC client.")
        ringClient = hodXRClient(ringXRAddress, None, None, 0, 0, 0)

        # Identify this hodring to the ringmaster as "<hostname>_<pid>".
        ringId = self.hostname + "_" + str(os.getpid())

        # NOTE(review): both loops below spin without delay until the
        # ringmaster returns a non-empty list.
        cmdlist = []
        if initialize:
            if 'download-addr' in self._cfg:
                self.__download_package(ringClient)
            else:
                self.log.debug("Did not find a download address.")
            while cmdlist == []:
                cmdlist = ringClient.getCommand(ringId)
        else:
            while cmdlist == []:
                cmdlist = ringClient.getAdminCommand(ringId)

        self.log.debug(pformat(cmdlist))
        self._cfg['commanddesc'] = [
            CommandDesc(cmds['dict'], self.log) for cmds in cmdlist]

        if not initialize:
            self.log.info("Running hadoop commands again... - No Initialize")
            self.__run_hadoop_commands()
        else:
            self.log.info("Running hadoop commands again... - Initialize")
            self.__run_hadoop_commands()

            masterParams = []
            for _key, cmd in self.__running.iteritems():
                self.log.debug(cmd)
                masterParams.extend(cmd.filledInKeyVals)

            self.log.debug("printing getparams")
            self.log.debug(pformat(ringId))
            self.log.debug(pformat(masterParams))
            # when this is on a required host, the ringMaster already has our masterParams
            if masterParams:
                ringClient.addMasterParams(ringId, masterParams)
    except:
        self.log.error(get_exception_string())

    return True