def start(self):
    """Run and maintain hodring commands.

    Steps, in order:
      1. If the config has a 'download-addr', create and start the local
         HTTP server.
      2. Start the base service.
      3. Resolve the ringmaster XML-RPC address: taken from
         'ringmaster-xrs-addr' when configured, otherwise obtained by
         polling the service registry.
      4. Download the package when a 'download-addr' is configured.
      5. Poll the ringmaster until it returns something other than an
         empty command list, store the resulting CommandDesc objects in
         the config under 'commanddesc' and run them.
      6. Send the filled-in key/values of the running commands back to
         the ringmaster, if there are any.

    Raises:
      Exception: if no ringmaster XML-RPC address could be obtained.
      Any other error raised along the way propagates unchanged, so the
      caller sees the original exception type and traceback.
    """
    if 'download-addr' in self._cfg:
        self._http = threadedHTTPServer('', self._cfg['http-port-range'])
        self.log.info("Starting http server...")
        # NOTE(review): serve_forever() is presumably non-blocking here
        # (threaded server), since the address is read right after - confirm.
        self._http.serve_forever()
        self.log.debug("http://%s:%d" % (self._http.server_address[0],
                                         self._http.server_address[1]))

    hodBaseService.start(self)

    ringXRAddress = None
    if 'ringmaster-xrs-addr' in self._cfg:
        ringXRAddress = "http://%s:%s/" % (
            self._cfg['ringmaster-xrs-addr'][0],
            self._cfg['ringmaster-xrs-addr'][1])
        self.log.debug("Ringmaster at %s" % ringXRAddress)

    self.log.debug("Creating service registry XML-RPC client.")
    serviceClient = hodXRClient(to_http_url(self._cfg['svcrgy-addr']))

    if ringXRAddress is None:
        self.log.info("Did not get ringmaster XML-RPC address. Fetching information from service registry.")

        # One initial query plus at most 3000 retries, 0.2s apart.
        maxRetries = 3000
        attempts = 0
        while True:
            ringList = serviceClient.getServiceInfo(
                self._cfg['userid'], self._cfg['service-id'],
                'ringmaster', 'hod')

            if attempts == 0:
                # Only the first answer is dumped to the log. Uses the
                # bare pformat name that the rest of this method relies on.
                self.log.debug(pformat(ringList))

            # Check the type before the emptiness so that a non-list
            # answer is simply treated as "not registered yet".
            if isinstance(ringList, list) and ringList:
                ringXRAddress = ringList[0]['xrs']

            attempts = attempts + 1
            if ringXRAddress is not None or attempts > maxRetries:
                break
            # Sleep only when another query is going to follow.
            time.sleep(.2)

    if ringXRAddress is None:
        raise Exception("Could not get ringmaster XML-RPC server address.")

    self.log.debug("Creating ringmaster XML-RPC client.")
    ringClient = hodXRClient(ringXRAddress)

    # Identifier of this hodring instance: "<hostname>_<pid>".
    hodringId = self.hostname + "_" + str(os.getpid())

    if 'download-addr' in self._cfg:
        self.__download_package(ringClient)
    else:
        self.log.debug("Did not find a download address.")

    firstTime = True
    increment = 0
    hadoopStartupTime = 2

    cmdlist = ringClient.getCommand(hodringId)

    # Keep the explicit comparison with []: any other answer (even a
    # falsy one) ends the polling.
    while cmdlist == []:
        # The wait grows by one second per retry and is randomised by up
        # to 'cmd-retry-interval'; the first wait is extended by
        # hadoopStartupTime.
        baseDelay = increment + self._cfg['cmd-retry-initial-time']
        if firstTime:
            baseDelay = baseDelay + hadoopStartupTime
            firstTime = False
        sleepTime = baseDelay + random.uniform(
            0, self._cfg['cmd-retry-interval'])

        self.log.debug("Did not get command list. Waiting for %s seconds." % (sleepTime))
        time.sleep(sleepTime)
        increment = increment + 1
        cmdlist = ringClient.getCommand(hodringId)

    self.log.debug(pformat(cmdlist))

    cmdDescs = []
    for cmds in cmdlist:
        cmdDescs.append(CommandDesc(cmds['dict'], self.log))

    self._cfg['commanddesc'] = cmdDescs

    self.log.info("Running hadoop commands...")

    self.__run_hadoop_commands(False)

    masterParams = []
    for _, cmd in self.__running.iteritems():
        masterParams.extend(cmd.filledInKeyVals)

    self.log.debug("printing getparams")
    self.log.debug(pformat(hodringId))
    self.log.debug(pformat(masterParams))

    # when this is on a required host, the ringMaster already has our masterParams
    if masterParams:
        ringClient.addMasterParams(hodringId, masterParams)
def __init__(self, cfg, log, **kwds):
    """Start the nodepool and the services of the ringmaster.

    Steps, in order:
      1. Initialise state and signal handlers, obtain the node pool and
         its service ID.
      2. When 'hadoop-tar-ball' is configured in the 'ringmaster' section,
         copy the tarball into the current (temp) directory.
      3. Build the Hdfs and MapReduce service descriptors (external or
         not) for the detected Hadoop version.
      4. Start the ringmaster server and, when a tarball is served, an
         HTTP server that publishes it.
      5. Optionally register with the service registry.
      6. Fill in the 'hodring' section of the config and initialise the
         job tracker monitor.

    Args:
      cfg: configuration indexed by section name; the sections read here
        are 'ringmaster', 'hodring', 'nodepooldesc', 'servicedesc' and,
        for external services, 'gridservice-hdfs' / 'gridservice-mapred'.
        The 'hodring' section is modified in place.
      log: logger used for debug and critical messages.
      **kwds: accepted but not used by this method.

    Raises:
      Exception: if the copied tarball cannot be found, or if the Hadoop
        version cannot be determined. Errors raised while building the
        service descriptors are logged and re-raised.
    """
    self.download = False
    self.httpServer = None
    self.cfg = cfg
    self.log = log
    self.__hostname = local_fqdn()
    self.workDirs = None

    # ref to the idle job tracker object.
    self.__jtMonitor = None
    self.__idlenessDetected = False
    self.__stopInProgress = False
    self.__isStopped = False  # to let main exit
    self.__exitCode = 0  # exit code with which the ringmaster main method should return

    ringmasterCfg = self.cfg['ringmaster']
    self.workers_per_ring = ringmasterCfg['workers_per_ring']

    self.__initialize_signal_handlers()

    npd = self.cfg['nodepooldesc']
    self.np = NodePoolUtil.getNodePool(npd, cfg, log)

    self.log.debug("Getting service ID.")
    self.serviceId = self.np.getServiceId()
    self.log.debug("Got service ID: %s" % self.serviceId)

    self.tarSrcLoc = None
    if 'hadoop-tar-ball' in ringmasterCfg:
        self.download = True
        self.tarSrcLoc = ringmasterCfg['hadoop-tar-ball']

    self.cd_to_tempdir()

    if self.download:
        self.__copy_tarball(os.getcwd())
        self.basename = self.__find_tarball_in_dir(os.getcwd())
        if self.basename is None:
            raise Exception('Did not find tarball copied from %s in %s.'
                            % (self.tarSrcLoc, os.getcwd()))

    self.serviceAddr = to_http_url(ringmasterCfg['svcrgy-addr'])

    self.log.debug("Service registry @ %s" % self.serviceAddr)

    self.serviceClient = hodXRClient(self.serviceAddr)
    self.serviceDict = {}
    try:
        sdl = self.cfg['servicedesc']

        workDirs = self.getWorkDirs(cfg)

        # Determine hadoop Version
        hadoopVers = hadoopVersion(self.__getHadoopDir(),
                                   self.cfg['hodring']['java-home'],
                                   self.log)

        if (hadoopVers['major'] is None) or (hadoopVers['minor'] is None):
            raise Exception('Could not retrive the version of Hadoop.'
                            + ' Check the Hadoop installation or the value of the hodring.java-home variable.')

        # Every service descriptor below is built for this minor version.
        minorVersion = int(hadoopVers['minor'])

        hdfsDesc = sdl['hdfs']
        if hdfsDesc.isExternal():
            hdfs = HdfsExternal(hdfsDesc, workDirs, version=minorVersion)
            hdfs.setMasterParams(self.cfg['gridservice-hdfs'])
        else:
            hdfs = Hdfs(hdfsDesc, workDirs, 0, version=minorVersion,
                        workers_per_ring=self.workers_per_ring)

        self.serviceDict[hdfs.getName()] = hdfs

        mrDesc = sdl['mapred']
        if mrDesc.isExternal():
            mr = MapReduceExternal(mrDesc, workDirs, version=minorVersion)
            mr.setMasterParams(self.cfg['gridservice-mapred'])
        else:
            mr = MapReduce(mrDesc, workDirs, 1, version=minorVersion,
                           workers_per_ring=self.workers_per_ring)

        self.serviceDict[mr.getName()] = mr
    except:
        # Deliberately catches everything: the failure is logged and then
        # re-raised untouched.
        self.log.critical(
            "Exception in creating Hdfs and Map/Reduce descriptor objects: %s."
            % get_exception_error_string())
        self.log.debug(get_exception_string())
        raise

    # should not be starting these in a constructor
    ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)

    self.rpcserver = ringMasterServer.getAddress()

    self.httpAddress = None
    self.tarAddress = None
    hostname = socket.gethostname()
    if self.download:
        self.httpServer = threadedHTTPServer(
            hostname, ringmasterCfg['http-port-range'])

        # NOTE(review): serve_forever() is presumably non-blocking here
        # (threaded server), since the address is read right after - confirm.
        self.httpServer.serve_forever()
        self.httpAddress = "http://%s:%d/" % (
            self.httpServer.server_address[0],
            self.httpServer.server_address[1])
        self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

        ringMasterServer.instance.logMasterSources.registerTarSource(
            hostname, self.tarAddress)
    else:
        self.log.debug("Download not set.")

    self.log.debug("%s %s %s %s %s" % (ringmasterCfg['userid'],
                                       self.serviceId, self.__hostname,
                                       'ringmaster', 'hod'))

    if ringmasterCfg['register']:
        # The HTTP address is advertised only when an HTTP server exists.
        serviceInfo = {'xrs': self.rpcserver}
        if self.httpAddress:
            serviceInfo['http'] = self.httpAddress

        self.serviceClient.registerService(
            ringmasterCfg['userid'], self.serviceId, self.__hostname,
            'ringmaster', 'hod', serviceInfo)

        self.log.debug("Registered with serivce registry: %s." % self.serviceAddr)

    hodRingWorkDir = os.path.join(cfg['hodring']['temp-dir'],
                                  'hodring' + '_' + getpass.getuser())

    # Settings placed into the 'hodring' section of the config.
    self.cfg['hodring']['hodring'] = [hodRingWorkDir]
    self.cfg['hodring']['svcrgy-addr'] = ringmasterCfg['svcrgy-addr']
    self.cfg['hodring']['service-id'] = self.np.getServiceId()

    self.cfg['hodring']['ringmaster-xrs-addr'] = self.__url_to_addr(
        self.rpcserver)

    if self.tarSrcLoc is not None:
        cfg['hodring']['download-addr'] = self.tarAddress

    self.__init_job_tracker_monitor(
        ringMasterServer.instance.logMasterSources)
def __init__(self, cfg, log, **kwds):
    """Start the nodepool and the services of the ringmaster.

    Steps, in order:
      1. Initialise state and signal handlers, obtain the node pool and
         its service ID.
      2. When 'hadoop-tar-ball' is configured in the 'ringmaster' section,
         copy the tarball into the current (temp) directory.
      3. Build the Hdfs and MapReduce service descriptors (external or
         not) for the detected Hadoop version.
      4. Start the ringmaster server and, when a tarball is served, an
         HTTP server that publishes it.
      5. Optionally register with the service registry.
      6. Fill in the 'hodring' section of the config and initialise the
         job tracker monitor.

    Args:
      cfg: configuration indexed by section name; the sections read here
        are 'ringmaster', 'hodring', 'nodepooldesc', 'servicedesc' and,
        for external services, 'gridservice-hdfs' / 'gridservice-mapred'.
        The 'hodring' section is modified in place.
      log: logger used for debug and critical messages.
      **kwds: accepted but not used by this method.

    Raises:
      Exception: if the copied tarball cannot be found, or if the Hadoop
        version cannot be determined. Errors raised while building the
        service descriptors are logged and re-raised.
    """
    self.download = False
    self.httpServer = None
    self.cfg = cfg
    self.log = log
    self.__hostname = local_fqdn()
    self.workDirs = None

    # ref to the idle job tracker object.
    self.__jtMonitor = None
    self.__idlenessDetected = False
    self.__stopInProgress = False
    self.__isStopped = False  # to let main exit
    self.__exitCode = 0  # exit code with which the ringmaster main method should return

    ringmasterCfg = self.cfg['ringmaster']
    self.workers_per_ring = ringmasterCfg['workers_per_ring']

    self.__initialize_signal_handlers()

    npd = self.cfg['nodepooldesc']
    self.np = NodePoolUtil.getNodePool(npd, cfg, log)

    self.log.debug("Getting service ID.")
    self.serviceId = self.np.getServiceId()
    self.log.debug("Got service ID: %s" % self.serviceId)

    self.tarSrcLoc = None
    if 'hadoop-tar-ball' in ringmasterCfg:
        self.download = True
        self.tarSrcLoc = ringmasterCfg['hadoop-tar-ball']

    self.cd_to_tempdir()

    if self.download:
        self.__copy_tarball(os.getcwd())
        self.basename = self.__find_tarball_in_dir(os.getcwd())
        if self.basename is None:
            raise Exception('Did not find tarball copied from %s in %s.'
                            % (self.tarSrcLoc, os.getcwd()))

    self.serviceAddr = to_http_url(ringmasterCfg['svcrgy-addr'])

    self.log.debug("Service registry @ %s" % self.serviceAddr)

    self.serviceClient = hodXRClient(self.serviceAddr)
    self.serviceDict = {}
    try:
        sdl = self.cfg['servicedesc']

        workDirs = self.getWorkDirs(cfg)

        # Determine hadoop Version
        hadoopVers = hadoopVersion(self.__getHadoopDir(),
                                   self.cfg['hodring']['java-home'],
                                   self.log)

        if (hadoopVers['major'] is None) or (hadoopVers['minor'] is None):
            raise Exception('Could not retrive the version of Hadoop.'
                            + ' Check the Hadoop installation or the value of the hodring.java-home variable.')

        # Every service descriptor below is built for this minor version.
        minorVersion = int(hadoopVers['minor'])

        hdfsDesc = sdl['hdfs']
        if hdfsDesc.isExternal():
            hdfs = HdfsExternal(hdfsDesc, workDirs, version=minorVersion)
            hdfs.setMasterParams(self.cfg['gridservice-hdfs'])
        else:
            hdfs = Hdfs(hdfsDesc, workDirs, 0, version=minorVersion,
                        workers_per_ring=self.workers_per_ring)

        self.serviceDict[hdfs.getName()] = hdfs

        mrDesc = sdl['mapred']
        if mrDesc.isExternal():
            mr = MapReduceExternal(mrDesc, workDirs, version=minorVersion)
            mr.setMasterParams(self.cfg['gridservice-mapred'])
        else:
            mr = MapReduce(mrDesc, workDirs, 1, version=minorVersion,
                           workers_per_ring=self.workers_per_ring)

        self.serviceDict[mr.getName()] = mr
    except:
        # Deliberately catches everything: the failure is logged and then
        # re-raised untouched.
        self.log.critical(
            "Exception in creating Hdfs and Map/Reduce descriptor objects: %s."
            % get_exception_error_string())
        self.log.debug(get_exception_string())
        raise

    # should not be starting these in a constructor
    ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)

    self.rpcserver = ringMasterServer.getAddress()

    self.httpAddress = None
    self.tarAddress = None
    hostname = socket.gethostname()
    if self.download:
        self.httpServer = threadedHTTPServer(
            hostname, ringmasterCfg['http-port-range'])

        # NOTE(review): serve_forever() is presumably non-blocking here
        # (threaded server), since the address is read right after - confirm.
        self.httpServer.serve_forever()
        self.httpAddress = "http://%s:%d/" % (
            self.httpServer.server_address[0],
            self.httpServer.server_address[1])
        self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

        ringMasterServer.instance.logMasterSources.registerTarSource(
            hostname, self.tarAddress)
    else:
        self.log.debug("Download not set.")

    self.log.debug("%s %s %s %s %s" % (ringmasterCfg['userid'],
                                       self.serviceId, self.__hostname,
                                       'ringmaster', 'hod'))

    if ringmasterCfg['register']:
        # The HTTP address is advertised only when an HTTP server exists.
        serviceInfo = {'xrs': self.rpcserver}
        if self.httpAddress:
            serviceInfo['http'] = self.httpAddress

        self.serviceClient.registerService(
            ringmasterCfg['userid'], self.serviceId, self.__hostname,
            'ringmaster', 'hod', serviceInfo)

        self.log.debug("Registered with serivce registry: %s." % self.serviceAddr)

    hodRingWorkDir = os.path.join(cfg['hodring']['temp-dir'],
                                  'hodring' + '_' + getpass.getuser())

    # Settings placed into the 'hodring' section of the config.
    self.cfg['hodring']['hodring'] = [hodRingWorkDir]
    self.cfg['hodring']['svcrgy-addr'] = ringmasterCfg['svcrgy-addr']
    self.cfg['hodring']['service-id'] = self.np.getServiceId()

    self.cfg['hodring']['ringmaster-xrs-addr'] = self.__url_to_addr(
        self.rpcserver)

    if self.tarSrcLoc is not None:
        cfg['hodring']['download-addr'] = self.tarAddress

    self.__init_job_tracker_monitor(
        ringMasterServer.instance.logMasterSources)
def start(self):
    """Run and maintain hodring commands.

    Steps, in order:
      1. If the config has a 'download-addr', create and start the local
         HTTP server.
      2. Start the base service.
      3. Resolve the ringmaster XML-RPC address: taken from
         'ringmaster-xrs-addr' when configured, otherwise obtained by
         polling the service registry.
      4. Download the package when a 'download-addr' is configured.
      5. Poll the ringmaster until it returns something other than an
         empty command list, store the resulting CommandDesc objects in
         the config under 'commanddesc' and run them.
      6. Send the filled-in key/values of the running commands back to
         the ringmaster, if there are any.

    Raises:
      Exception: if no ringmaster XML-RPC address could be obtained.
      Any other error raised along the way propagates unchanged, so the
      caller sees the original exception type and traceback.
    """
    if 'download-addr' in self._cfg:
        self._http = threadedHTTPServer('', self._cfg['http-port-range'])
        self.log.info("Starting http server...")
        # NOTE(review): serve_forever() is presumably non-blocking here
        # (threaded server), since the address is read right after - confirm.
        self._http.serve_forever()
        self.log.debug("http://%s:%d" % (self._http.server_address[0],
                                         self._http.server_address[1]))

    hodBaseService.start(self)

    ringXRAddress = None
    if 'ringmaster-xrs-addr' in self._cfg:
        ringXRAddress = "http://%s:%s/" % (
            self._cfg['ringmaster-xrs-addr'][0],
            self._cfg['ringmaster-xrs-addr'][1])
        self.log.debug("Ringmaster at %s" % ringXRAddress)

    self.log.debug("Creating service registry XML-RPC client.")
    serviceClient = hodXRClient(to_http_url(self._cfg['svcrgy-addr']))

    if ringXRAddress is None:
        self.log.info("Did not get ringmaster XML-RPC address. Fetching information from service registry.")

        # One initial query plus at most 3000 retries, 0.2s apart.
        maxRetries = 3000
        attempts = 0
        while True:
            ringList = serviceClient.getServiceInfo(
                self._cfg['userid'], self._cfg['service-id'],
                'ringmaster', 'hod')

            if attempts == 0:
                # Only the first answer is dumped to the log. Uses the
                # bare pformat name that the rest of this method relies on.
                self.log.debug(pformat(ringList))

            # Check the type before the emptiness so that a non-list
            # answer is simply treated as "not registered yet".
            if isinstance(ringList, list) and ringList:
                ringXRAddress = ringList[0]['xrs']

            attempts = attempts + 1
            if ringXRAddress is not None or attempts > maxRetries:
                break
            # Sleep only when another query is going to follow.
            time.sleep(.2)

    if ringXRAddress is None:
        raise Exception("Could not get ringmaster XML-RPC server address.")

    self.log.debug("Creating ringmaster XML-RPC client.")
    ringClient = hodXRClient(ringXRAddress)

    # Identifier of this hodring instance: "<hostname>_<pid>".
    hodringId = self.hostname + "_" + str(os.getpid())

    if 'download-addr' in self._cfg:
        self.__download_package(ringClient)
    else:
        self.log.debug("Did not find a download address.")

    firstTime = True
    increment = 0
    hadoopStartupTime = 2

    cmdlist = ringClient.getCommand(hodringId)

    # Keep the explicit comparison with []: any other answer (even a
    # falsy one) ends the polling.
    while cmdlist == []:
        # The wait grows by one second per retry and is randomised by up
        # to 'cmd-retry-interval'; the first wait is extended by
        # hadoopStartupTime.
        baseDelay = increment + self._cfg['cmd-retry-initial-time']
        if firstTime:
            baseDelay = baseDelay + hadoopStartupTime
            firstTime = False
        sleepTime = baseDelay + random.uniform(
            0, self._cfg['cmd-retry-interval'])

        self.log.debug("Did not get command list. Waiting for %s seconds." % (sleepTime))
        time.sleep(sleepTime)
        increment = increment + 1
        cmdlist = ringClient.getCommand(hodringId)

    self.log.debug(pformat(cmdlist))

    cmdDescs = []
    for cmds in cmdlist:
        cmdDescs.append(CommandDesc(cmds['dict'], self.log))

    self._cfg['commanddesc'] = cmdDescs

    self.log.info("Running hadoop commands...")

    self.__run_hadoop_commands(False)

    masterParams = []
    for _, cmd in self.__running.iteritems():
        masterParams.extend(cmd.filledInKeyVals)

    self.log.debug("printing getparams")
    self.log.debug(pformat(hodringId))
    self.log.debug(pformat(masterParams))

    # when this is on a required host, the ringMaster already has our masterParams
    if masterParams:
        ringClient.addMasterParams(hodringId, masterParams)