def __init_job_tracker_monitor(self, logMasterSources): hadoopDir = self.__getHadoopDir() self.log.debug('hadoopdir=%s, java-home=%s' % \ (hadoopDir, self.cfg['hodring']['java-home'])) try: self.__jtMonitor = JobTrackerMonitor( self.log, self, self.cfg['ringmaster']['jt-poll-interval'], self.cfg['ringmaster']['idleness-limit'], hadoopDir, self.cfg['hodring']['java-home'], logMasterSources) self.log.debug('starting jt monitor') self.__jtMonitor.start() except: self.log.critical( 'Exception in running idle job tracker. This cluster cannot be deallocated if idle.\ Exception message: %s' % get_exception_error_string()) self.log.debug('Exception details: %s' % get_exception_string())
def __init_job_tracker_monitor(self, logMasterSources): hadoopDir = self.__getHadoopDir() self.log.debug('hadoopdir=%s, java-home=%s' % \ (hadoopDir, self.cfg['hodring']['java-home'])) try: self.__jtMonitor = JobTrackerMonitor(self.log, self, self.cfg['ringmaster']['jt-poll-interval'], self.cfg['ringmaster']['idleness-limit'], hadoopDir, self.cfg['hodring']['java-home'], logMasterSources) self.log.debug('starting jt monitor') self.__jtMonitor.start() except: self.log.critical('Exception in running idle job tracker. This cluster cannot be deallocated if idle.\ Exception message: %s' % get_exception_error_string()) self.log.debug('Exception details: %s' % get_exception_string())
def __init_job_tracker_monitor(self, logMasterSources): hadoopDir = self.__getHadoopDir() self.log.debug("hadoopdir=%s, java-home=%s" % (hadoopDir, self.cfg["hodring"]["java-home"])) try: self.__jtMonitor = JobTrackerMonitor( self.log, self, self.cfg["ringmaster"]["jt-poll-interval"], self.cfg["ringmaster"]["idleness-limit"], hadoopDir, self.cfg["hodring"]["java-home"], logMasterSources, ) self.log.debug("starting jt monitor") self.__jtMonitor.start() except: self.log.critical( "Exception in running idle job tracker. This cluster cannot be deallocated if idle.\ Exception message: %s" % get_exception_error_string() ) self.log.debug("Exception details: %s" % get_exception_string())
class RingMaster: def __init__(self, cfg, log, **kwds): """starts nodepool and services""" self.download = False self.httpServer = None self.cfg = cfg self.log = log self.__hostname = local_fqdn() self.workDirs = None # ref to the idle job tracker object. self.__jtMonitor = None self.__idlenessDetected = False self.__stopInProgress = False self.__isStopped = False # to let main exit self.__exitCode = 0 # exit code with which the ringmaster main method should return self.workers_per_ring = self.cfg['ringmaster']['workers_per_ring'] self.__initialize_signal_handlers() sdd = self.cfg['servicedesc'] gsvc = None for key in sdd: gsvc = sdd[key] break npd = self.cfg['nodepooldesc'] self.np = NodePoolUtil.getNodePool(npd, cfg, log) self.log.debug("Getting service ID.") self.serviceId = self.np.getServiceId() self.log.debug("Got service ID: %s" % self.serviceId) self.tarSrcLoc = None if self.cfg['ringmaster'].has_key('hadoop-tar-ball'): self.download = True self.tarSrcLoc = self.cfg['ringmaster']['hadoop-tar-ball'] self.cd_to_tempdir() if (self.download): self.__copy_tarball(os.getcwd()) self.basename = self.__find_tarball_in_dir(os.getcwd()) if self.basename is None: raise Exception('Did not find tarball copied from %s in %s.' % (self.tarSrcLoc, os.getcwd())) self.serviceAddr = to_http_url(self.cfg['ringmaster']['svcrgy-addr']) self.log.debug("Service registry @ %s" % self.serviceAddr) self.serviceClient = hodXRClient(self.serviceAddr) self.serviceDict = {} try: sdl = self.cfg['servicedesc'] workDirs = self.getWorkDirs(cfg) hdfsDesc = sdl['hdfs'] hdfs = None # Determine hadoop Version hadoopVers = hadoopVersion(self.__getHadoopDir(), \ self.cfg['hodring']['java-home'], self.log) if (hadoopVers['major'] == None) or (hadoopVers['minor'] == None): raise Exception( 'Could not retrive the version of Hadoop.' + ' Check the Hadoop installation or the value of the hodring.java-home variable.' ) if hdfsDesc.isExternal(): hdfs = HdfsExternal(hdfsDesc, workDirs, version=int(hadoopVers['minor'])) hdfs.setMasterParams(self.cfg['gridservice-hdfs']) else: hdfs = Hdfs(hdfsDesc, workDirs, 0, version=int(hadoopVers['minor']), workers_per_ring=self.workers_per_ring) self.serviceDict[hdfs.getName()] = hdfs mrDesc = sdl['mapred'] mr = None if mrDesc.isExternal(): mr = MapReduceExternal(mrDesc, workDirs, version=int(hadoopVers['minor'])) mr.setMasterParams(self.cfg['gridservice-mapred']) else: mr = MapReduce(mrDesc, workDirs, 1, version=int(hadoopVers['minor']), workers_per_ring=self.workers_per_ring) self.serviceDict[mr.getName()] = mr except: self.log.critical( "Exception in creating Hdfs and Map/Reduce descriptor objects: \ %s." % get_exception_error_string()) self.log.debug(get_exception_string()) raise # should not be starting these in a constructor ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self) self.rpcserver = ringMasterServer.getAddress() self.httpAddress = None self.tarAddress = None hostname = socket.gethostname() if (self.download): self.httpServer = threadedHTTPServer( hostname, self.cfg['ringmaster']['http-port-range']) self.httpServer.serve_forever() self.httpAddress = "http://%s:%d/" % ( self.httpServer.server_address[0], self.httpServer.server_address[1]) self.tarAddress = "%s%s" % (self.httpAddress, self.basename) ringMasterServer.instance.logMasterSources.registerTarSource( hostname, self.tarAddress) else: self.log.debug("Download not set.") self.log.debug("%s %s %s %s %s" % (self.cfg['ringmaster']['userid'], self.serviceId, self.__hostname, 'ringmaster', 'hod')) if self.cfg['ringmaster']['register']: if self.httpAddress: self.serviceClient.registerService( self.cfg['ringmaster']['userid'], self.serviceId, self.__hostname, 'ringmaster', 'hod', { 'xrs': self.rpcserver, 'http': self.httpAddress }) else: self.serviceClient.registerService( self.cfg['ringmaster']['userid'], self.serviceId, self.__hostname, 'ringmaster', 'hod', { 'xrs': self.rpcserver, }) self.log.debug("Registered with serivce registry: %s." % self.serviceAddr) hodRingPath = os.path.join(cfg['ringmaster']['base-dir'], 'bin', 'hodring') hodRingWorkDir = os.path.join(cfg['hodring']['temp-dir'], 'hodring' + '_' + getpass.getuser()) self.cfg['hodring']['hodring'] = [ hodRingWorkDir, ] self.cfg['hodring']['svcrgy-addr'] = self.cfg['ringmaster'][ 'svcrgy-addr'] self.cfg['hodring']['service-id'] = self.np.getServiceId() self.cfg['hodring']['ringmaster-xrs-addr'] = self.__url_to_addr( self.rpcserver) if (self.tarSrcLoc != None): cfg['hodring']['download-addr'] = self.tarAddress self.__init_job_tracker_monitor( ringMasterServer.instance.logMasterSources) def __init_job_tracker_monitor(self, logMasterSources): hadoopDir = self.__getHadoopDir() self.log.debug('hadoopdir=%s, java-home=%s' % \ (hadoopDir, self.cfg['hodring']['java-home'])) try: self.__jtMonitor = JobTrackerMonitor( self.log, self, self.cfg['ringmaster']['jt-poll-interval'], self.cfg['ringmaster']['idleness-limit'], hadoopDir, self.cfg['hodring']['java-home'], logMasterSources) self.log.debug('starting jt monitor') self.__jtMonitor.start() except: self.log.critical( 'Exception in running idle job tracker. This cluster cannot be deallocated if idle.\ Exception message: %s' % get_exception_error_string()) self.log.debug('Exception details: %s' % get_exception_string()) def __getHadoopDir(self): hadoopDir = None if self.cfg['ringmaster'].has_key('hadoop-tar-ball'): tarFile = os.path.join(os.getcwd(), self.basename) ret = untar(tarFile, os.getcwd()) if not ret: raise Exception('Untarring tarfile %s to directory %s failed. Cannot find hadoop directory.' \ % (tarFile, os.getcwd())) hadoopDir = os.path.join(os.getcwd(), self.__get_dir(tarFile)) else: hadoopDir = self.cfg['gridservice-mapred']['pkgs'] self.log.debug('Returning Hadoop directory as: %s' % hadoopDir) return hadoopDir def __get_dir(self, name): """Return the root directory inside the tarball specified by name. Assumes that the tarball begins with a root directory.""" import tarfile myTarFile = tarfile.open(name) hadoopPackage = myTarFile.getnames()[0] self.log.debug("tarball name : %s hadoop package name : %s" % (name, hadoopPackage)) return hadoopPackage def __find_tarball_in_dir(self, dir): """Find the tarball among files specified in the given directory. We need this method because how the tarball source URI is given depends on the method of copy and we can't get the tarball name from that. This method will fail if there are multiple tarballs in the directory with the same suffix.""" files = os.listdir(dir) for file in files: if self.tarSrcLoc.endswith(file): return file return None def __copy_tarball(self, destDir): """Copy the hadoop tar ball from a remote location to the specified destination directory. Based on the URL it executes an appropriate copy command. Throws an exception if the command returns a non-zero exit code.""" # for backwards compatibility, treat the default case as file:// url = '' if self.tarSrcLoc.startswith('/'): url = 'file:/' src = '%s%s' % (url, self.tarSrcLoc) if src.startswith('file://'): src = src[len('file://') - 1:] cpCmd = '/bin/cp' cmd = '%s %s %s' % (cpCmd, src, destDir) self.log.debug('Command to execute: %s' % cmd) copyProc = simpleCommand('remote copy', cmd) copyProc.start() copyProc.wait() copyProc.join() ret = copyProc.exit_code() self.log.debug('Completed command execution. Exit Code: %s.' % ret) if ret != 0: output = copyProc.output() raise Exception( 'Could not copy tarball using command %s. Exit code: %s. Output: %s' % (cmd, ret, output)) else: raise Exception('Unsupported URL for file: %s' % src) # input: http://hostname:port/. output: [hostname,port] def __url_to_addr(self, url): addr = url.rstrip('/') if addr.startswith('http://'): addr = addr.replace('http://', '', 1) addr_parts = addr.split(':') return [addr_parts[0], int(addr_parts[1])] def __initialize_signal_handlers(self): def sigStop(sigNum, handler): sig_wrapper(sigNum, self.stop) signal.signal(signal.SIGTERM, sigStop) signal.signal(signal.SIGINT, sigStop) signal.signal(signal.SIGQUIT, sigStop) def __clean_up(self): tempDir = self.__get_tempdir() os.chdir(os.path.split(tempDir)[0]) if os.path.exists(tempDir): shutil.rmtree(tempDir, True) self.log.debug("Cleaned up temporary dir: %s" % tempDir) def __get_tempdir(self): dir = os.path.join( self.cfg['ringmaster']['temp-dir'], "%s.%s.ringmaster" % (self.cfg['ringmaster']['userid'], self.np.getServiceId())) return dir def getWorkDirs(self, cfg, reUse=False): if (not reUse) or (self.workDirs == None): import math frand = random.random() while math.ceil(frand) != math.floor(frand): frand = frand * 100 irand = int(frand) uniq = '%s-%d-%s' % (socket.gethostname(), os.getpid(), irand) dirs = [] parentDirs = cfg['ringmaster']['work-dirs'] for p in parentDirs: dir = os.path.join(p, uniq) dirs.append(dir) self.workDirs = dirs return self.workDirs def _fetchLink(self, link, parentDir): parser = miniHTMLParser() self.log.debug("Checking link %s" % link) while link: # Get the file from the site and link input = urllib.urlopen(link) out = None contentType = input.info().gettype() isHtml = contentType == 'text/html' #print contentType if isHtml: parser.setBaseUrl(input.geturl()) else: parsed = urlparse.urlparse(link) hp = parsed[1] h = hp p = None if hp.find(':') != -1: h, p = hp.split(':', 1) path = parsed[2] path = path.split('/') file = os.path.join(parentDir, h, p) for c in path: if c == '': continue file = os.path.join(file, c) try: self.log.debug('Creating %s' % file) dir, tail = os.path.split(file) if not os.path.exists(dir): os.makedirs(dir) except: self.log.debug(get_exception_string()) out = open(file, 'w') bufSz = 8192 buf = input.read(bufSz) while len(buf) > 0: if isHtml: # Feed the file into the HTML parser parser.feed(buf) if out: out.write(buf) buf = input.read(bufSz) input.close() if out: out.close() # Search the retfile here # Get the next link in level traversal order link = parser.getNextLink() parser.close() def _finalize(self): try: # FIXME: get dir from config dir = 'HOD-log-P%d' % (os.getpid()) dir = os.path.join('.', dir) except: self.log.debug(get_exception_string()) self.np.finalize() def handleIdleJobTracker(self): self.log.critical("Detected idle job tracker for %s seconds. The allocation will be cleaned up." \ % self.cfg['ringmaster']['idleness-limit']) self.__idlenessDetected = True def cd_to_tempdir(self): dir = self.__get_tempdir() if not os.path.exists(dir): os.makedirs(dir) os.chdir(dir) return dir def getWorkload(self): return self.workload def getHostName(self): return self.__hostname def start(self): """run the thread main loop""" self.log.debug("Entered start method.") hodring = os.path.join(self.cfg['ringmaster']['base-dir'], 'bin', 'hodring') largs = [hodring] targs = self.cfg.get_args(section='hodring') largs.extend(targs) hodringCmd = "" for item in largs: hodringCmd = "%s%s " % (hodringCmd, item) self.log.debug(hodringCmd) if self.np.runWorkers(largs) > 0: self.log.critical("Failed to start worker.") self.log.debug("Returned from runWorkers.") self._finalize() def __findExitCode(self): """Determine the exit code based on the status of the cluster or jobs run on them""" xmlrpcServer = ringMasterServer.instance.logMasterSources if xmlrpcServer.getServiceAddr('hdfs') == 'not found' or \ xmlrpcServer.getServiceAddr('hdfs').startswith("Error: "): self.__exitCode = 7 elif xmlrpcServer.getServiceAddr('mapred') == 'not found' or \ xmlrpcServer.getServiceAddr('mapred').startswith("Error: "): self.__exitCode = 8 else: clusterStatus = get_cluster_status( xmlrpcServer.getServiceAddr('hdfs'), xmlrpcServer.getServiceAddr('mapred')) if clusterStatus != 0: self.__exitCode = clusterStatus else: self.__exitCode = self.__findHadoopJobsExitCode() self.log.debug('exit code %s' % self.__exitCode) def __findHadoopJobsExitCode(self): """Determine the consolidate exit code of hadoop jobs run on this cluster, provided this information is available. Return 0 otherwise""" ret = 0 failureStatus = 3 failureCount = 0 if self.__jtMonitor: jobStatusList = self.__jtMonitor.getJobsStatus() try: if len(jobStatusList) > 0: for jobStatus in jobStatusList: self.log.debug( 'job status for %s: %s' % (jobStatus.getJobId(), jobStatus.getStatus())) if jobStatus.getStatus() == failureStatus: failureCount = failureCount + 1 if failureCount > 0: if failureCount == len(jobStatusList): # all jobs failed ret = 16 else: ret = 17 except: self.log.debug('exception in finding hadoop jobs exit code' % get_exception_string()) return ret def stop(self): self.log.debug("RingMaster stop method invoked.") if self.__stopInProgress or self.__isStopped: return self.__stopInProgress = True if ringMasterServer.instance is not None: self.log.debug('finding exit code') self.__findExitCode() self.log.debug('stopping ringmaster instance') ringMasterServer.stopService() else: self.__exitCode = 6 if self.__jtMonitor is not None: self.__jtMonitor.stop() if self.httpServer: self.httpServer.stop() self.__clean_up() self.__isStopped = True def shouldStop(self): """Indicates whether the main loop should exit, either due to idleness condition, or a stop signal was received""" return self.__idlenessDetected or self.__isStopped def getExitCode(self): """return the exit code of the program""" return self.__exitCode
class RingMaster: def __init__(self, cfg, log, **kwds): """starts nodepool and services""" self.download = False self.httpServer = None self.cfg = cfg self.log = log self.__hostname = local_fqdn() self.workDirs = None # ref to the idle job tracker object. self.__jtMonitor = None self.__idlenessDetected = False self.__stopInProgress = False self.__isStopped = False # to let main exit self.__exitCode = 0 # exit code with which the ringmaster main method should return self.workers_per_ring = self.cfg["ringmaster"]["workers_per_ring"] self.__initialize_signal_handlers() sdd = self.cfg["servicedesc"] gsvc = None for key in sdd: gsvc = sdd[key] break npd = self.cfg["nodepooldesc"] self.np = NodePoolUtil.getNodePool(npd, cfg, log) self.log.debug("Getting service ID.") self.serviceId = self.np.getServiceId() self.log.debug("Got service ID: %s" % self.serviceId) self.tarSrcLoc = None if self.cfg["ringmaster"].has_key("hadoop-tar-ball"): self.download = True self.tarSrcLoc = self.cfg["ringmaster"]["hadoop-tar-ball"] self.cd_to_tempdir() if self.download: self.__copy_tarball(os.getcwd()) self.basename = self.__find_tarball_in_dir(os.getcwd()) if self.basename is None: raise Exception("Did not find tarball copied from %s in %s." % (self.tarSrcLoc, os.getcwd())) self.serviceAddr = to_http_url(self.cfg["ringmaster"]["svcrgy-addr"]) self.log.debug("Service registry @ %s" % self.serviceAddr) self.serviceClient = hodXRClient(self.serviceAddr) self.serviceDict = {} try: sdl = self.cfg["servicedesc"] workDirs = self.getWorkDirs(cfg) hdfsDesc = sdl["hdfs"] hdfs = None # Determine hadoop Version hadoopVers = hadoopVersion(self.__getHadoopDir(), self.cfg["hodring"]["java-home"], self.log) if (hadoopVers["major"] == None) or (hadoopVers["minor"] == None): raise Exception( "Could not retrive the version of Hadoop." + " Check the Hadoop installation or the value of the hodring.java-home variable." ) if hdfsDesc.isExternal(): hdfs = HdfsExternal(hdfsDesc, workDirs, version=int(hadoopVers["minor"])) hdfs.setMasterParams(self.cfg["gridservice-hdfs"]) else: hdfs = Hdfs( hdfsDesc, workDirs, 0, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring ) self.serviceDict[hdfs.getName()] = hdfs mrDesc = sdl["mapred"] mr = None if mrDesc.isExternal(): mr = MapReduceExternal(mrDesc, workDirs, version=int(hadoopVers["minor"])) mr.setMasterParams(self.cfg["gridservice-mapred"]) else: mr = MapReduce( mrDesc, workDirs, 1, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring ) self.serviceDict[mr.getName()] = mr except: self.log.critical( "Exception in creating Hdfs and Map/Reduce descriptor objects: \ %s." % get_exception_error_string() ) self.log.debug(get_exception_string()) raise # should not be starting these in a constructor ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self) self.rpcserver = ringMasterServer.getAddress() self.httpAddress = None self.tarAddress = None hostname = socket.gethostname() if self.download: self.httpServer = threadedHTTPServer(hostname, self.cfg["ringmaster"]["http-port-range"]) self.httpServer.serve_forever() self.httpAddress = "http://%s:%d/" % (self.httpServer.server_address[0], self.httpServer.server_address[1]) self.tarAddress = "%s%s" % (self.httpAddress, self.basename) ringMasterServer.instance.logMasterSources.registerTarSource(hostname, self.tarAddress) else: self.log.debug("Download not set.") self.log.debug( "%s %s %s %s %s" % (self.cfg["ringmaster"]["userid"], self.serviceId, self.__hostname, "ringmaster", "hod") ) if self.cfg["ringmaster"]["register"]: if self.httpAddress: self.serviceClient.registerService( self.cfg["ringmaster"]["userid"], self.serviceId, self.__hostname, "ringmaster", "hod", {"xrs": self.rpcserver, "http": self.httpAddress}, ) else: self.serviceClient.registerService( self.cfg["ringmaster"]["userid"], self.serviceId, self.__hostname, "ringmaster", "hod", {"xrs": self.rpcserver}, ) self.log.debug("Registered with serivce registry: %s." % self.serviceAddr) hodRingPath = os.path.join(cfg["ringmaster"]["base-dir"], "bin", "hodring") hodRingWorkDir = os.path.join(cfg["hodring"]["temp-dir"], "hodring" + "_" + getpass.getuser()) self.cfg["hodring"]["hodring"] = [hodRingWorkDir] self.cfg["hodring"]["svcrgy-addr"] = self.cfg["ringmaster"]["svcrgy-addr"] self.cfg["hodring"]["service-id"] = self.np.getServiceId() self.cfg["hodring"]["ringmaster-xrs-addr"] = self.__url_to_addr(self.rpcserver) if self.tarSrcLoc != None: cfg["hodring"]["download-addr"] = self.tarAddress self.__init_job_tracker_monitor(ringMasterServer.instance.logMasterSources) def __init_job_tracker_monitor(self, logMasterSources): hadoopDir = self.__getHadoopDir() self.log.debug("hadoopdir=%s, java-home=%s" % (hadoopDir, self.cfg["hodring"]["java-home"])) try: self.__jtMonitor = JobTrackerMonitor( self.log, self, self.cfg["ringmaster"]["jt-poll-interval"], self.cfg["ringmaster"]["idleness-limit"], hadoopDir, self.cfg["hodring"]["java-home"], logMasterSources, ) self.log.debug("starting jt monitor") self.__jtMonitor.start() except: self.log.critical( "Exception in running idle job tracker. This cluster cannot be deallocated if idle.\ Exception message: %s" % get_exception_error_string() ) self.log.debug("Exception details: %s" % get_exception_string()) def __getHadoopDir(self): hadoopDir = None if self.cfg["ringmaster"].has_key("hadoop-tar-ball"): tarFile = os.path.join(os.getcwd(), self.basename) ret = untar(tarFile, os.getcwd()) if not ret: raise Exception( "Untarring tarfile %s to directory %s failed. Cannot find hadoop directory." % (tarFile, os.getcwd()) ) hadoopDir = os.path.join(os.getcwd(), self.__get_dir(tarFile)) else: hadoopDir = self.cfg["gridservice-mapred"]["pkgs"] self.log.debug("Returning Hadoop directory as: %s" % hadoopDir) return hadoopDir def __get_dir(self, name): """Return the root directory inside the tarball specified by name. Assumes that the tarball begins with a root directory.""" import tarfile myTarFile = tarfile.open(name) hadoopPackage = myTarFile.getnames()[0] self.log.debug("tarball name : %s hadoop package name : %s" % (name, hadoopPackage)) return hadoopPackage def __find_tarball_in_dir(self, dir): """Find the tarball among files specified in the given directory. We need this method because how the tarball source URI is given depends on the method of copy and we can't get the tarball name from that. This method will fail if there are multiple tarballs in the directory with the same suffix.""" files = os.listdir(dir) for file in files: if self.tarSrcLoc.endswith(file): return file return None def __copy_tarball(self, destDir): """Copy the hadoop tar ball from a remote location to the specified destination directory. Based on the URL it executes an appropriate copy command. Throws an exception if the command returns a non-zero exit code.""" # for backwards compatibility, treat the default case as file:// url = "" if self.tarSrcLoc.startswith("/"): url = "file:/" src = "%s%s" % (url, self.tarSrcLoc) if src.startswith("file://"): src = src[len("file://") - 1 :] cpCmd = "/bin/cp" cmd = "%s %s %s" % (cpCmd, src, destDir) self.log.debug("Command to execute: %s" % cmd) copyProc = simpleCommand("remote copy", cmd) copyProc.start() copyProc.wait() copyProc.join() ret = copyProc.exit_code() self.log.debug("Completed command execution. Exit Code: %s." % ret) if ret != 0: output = copyProc.output() raise Exception( "Could not copy tarball using command %s. Exit code: %s. Output: %s" % (cmd, ret, output) ) else: raise Exception("Unsupported URL for file: %s" % src) # input: http://hostname:port/. output: [hostname,port] def __url_to_addr(self, url): addr = url.rstrip("/") if addr.startswith("http://"): addr = addr.replace("http://", "", 1) addr_parts = addr.split(":") return [addr_parts[0], int(addr_parts[1])] def __initialize_signal_handlers(self): def sigStop(sigNum, handler): sig_wrapper(sigNum, self.stop) signal.signal(signal.SIGTERM, sigStop) signal.signal(signal.SIGINT, sigStop) signal.signal(signal.SIGQUIT, sigStop) def __clean_up(self): tempDir = self.__get_tempdir() os.chdir(os.path.split(tempDir)[0]) if os.path.exists(tempDir): shutil.rmtree(tempDir, True) self.log.debug("Cleaned up temporary dir: %s" % tempDir) def __get_tempdir(self): dir = os.path.join( self.cfg["ringmaster"]["temp-dir"], "%s.%s.ringmaster" % (self.cfg["ringmaster"]["userid"], self.np.getServiceId()), ) return dir def getWorkDirs(self, cfg, reUse=False): if (not reUse) or (self.workDirs == None): import math frand = random.random() while math.ceil(frand) != math.floor(frand): frand = frand * 100 irand = int(frand) uniq = "%s-%d-%s" % (socket.gethostname(), os.getpid(), irand) dirs = [] parentDirs = cfg["ringmaster"]["work-dirs"] for p in parentDirs: dir = os.path.join(p, uniq) dirs.append(dir) self.workDirs = dirs return self.workDirs def _fetchLink(self, link, parentDir): parser = miniHTMLParser() self.log.debug("Checking link %s" % link) while link: # Get the file from the site and link input = urllib.urlopen(link) out = None contentType = input.info().gettype() isHtml = contentType == "text/html" # print contentType if isHtml: parser.setBaseUrl(input.geturl()) else: parsed = urlparse.urlparse(link) hp = parsed[1] h = hp p = None if hp.find(":") != -1: h, p = hp.split(":", 1) path = parsed[2] path = path.split("/") file = os.path.join(parentDir, h, p) for c in path: if c == "": continue file = os.path.join(file, c) try: self.log.debug("Creating %s" % file) dir, tail = os.path.split(file) if not os.path.exists(dir): os.makedirs(dir) except: self.log.debug(get_exception_string()) out = open(file, "w") bufSz = 8192 buf = input.read(bufSz) while len(buf) > 0: if isHtml: # Feed the file into the HTML parser parser.feed(buf) if out: out.write(buf) buf = input.read(bufSz) input.close() if out: out.close() # Search the retfile here # Get the next link in level traversal order link = parser.getNextLink() parser.close() def _finalize(self): try: # FIXME: get dir from config dir = "HOD-log-P%d" % (os.getpid()) dir = os.path.join(".", dir) except: self.log.debug(get_exception_string()) self.np.finalize() def handleIdleJobTracker(self): self.log.critical( "Detected idle job tracker for %s seconds. The allocation will be cleaned up." % self.cfg["ringmaster"]["idleness-limit"] ) self.__idlenessDetected = True def cd_to_tempdir(self): dir = self.__get_tempdir() if not os.path.exists(dir): os.makedirs(dir) os.chdir(dir) return dir def getWorkload(self): return self.workload def getHostName(self): return self.__hostname def start(self): """run the thread main loop""" self.log.debug("Entered start method.") hodring = os.path.join(self.cfg["ringmaster"]["base-dir"], "bin", "hodring") largs = [hodring] targs = self.cfg.get_args(section="hodring") largs.extend(targs) hodringCmd = "" for item in largs: hodringCmd = "%s%s " % (hodringCmd, item) self.log.debug(hodringCmd) if self.np.runWorkers(largs) > 0: self.log.critical("Failed to start worker.") self.log.debug("Returned from runWorkers.") self._finalize() def __findExitCode(self): """Determine the exit code based on the status of the cluster or jobs run on them""" xmlrpcServer = ringMasterServer.instance.logMasterSources if xmlrpcServer.getServiceAddr("hdfs") == "not found" or xmlrpcServer.getServiceAddr("hdfs").startswith( "Error: " ): self.__exitCode = 7 elif xmlrpcServer.getServiceAddr("mapred") == "not found" or xmlrpcServer.getServiceAddr("mapred").startswith( "Error: " ): self.__exitCode = 8 else: clusterStatus = get_cluster_status( xmlrpcServer.getServiceAddr("hdfs"), xmlrpcServer.getServiceAddr("mapred") ) if clusterStatus != 0: self.__exitCode = clusterStatus else: self.__exitCode = self.__findHadoopJobsExitCode() self.log.debug("exit code %s" % self.__exitCode) def __findHadoopJobsExitCode(self): """Determine the consolidate exit code of hadoop jobs run on this cluster, provided this information is available. Return 0 otherwise""" ret = 0 failureStatus = 3 failureCount = 0 if self.__jtMonitor: jobStatusList = self.__jtMonitor.getJobsStatus() try: if len(jobStatusList) > 0: for jobStatus in jobStatusList: self.log.debug("job status for %s: %s" % (jobStatus.getJobId(), jobStatus.getStatus())) if jobStatus.getStatus() == failureStatus: failureCount = failureCount + 1 if failureCount > 0: if failureCount == len(jobStatusList): # all jobs failed ret = 16 else: ret = 17 except: self.log.debug("exception in finding hadoop jobs exit code" % get_exception_string()) return ret def stop(self): self.log.debug("RingMaster stop method invoked.") if self.__stopInProgress or self.__isStopped: return self.__stopInProgress = True if ringMasterServer.instance is not None: self.log.debug("finding exit code") self.__findExitCode() self.log.debug("stopping ringmaster instance") ringMasterServer.stopService() else: self.__exitCode = 6 if self.__jtMonitor is not None: self.__jtMonitor.stop() if self.httpServer: self.httpServer.stop() self.__clean_up() self.__isStopped = True def shouldStop(self): """Indicates whether the main loop should exit, either due to idleness condition, or a stop signal was received""" return self.__idlenessDetected or self.__isStopped def getExitCode(self): """return the exit code of the program""" return self.__exitCode