def __request(self, methodname, params):
    """Invoke a remote method with a per-attempt alarm timeout and retries.

    Each attempt arms ``signal.alarm(self.__timeOut)`` and delegates to
    ``self._ServerProxy__request(methodname, params)``.  At most 30
    attempts are made.

    Args:
      methodname: passed through unchanged to ``_ServerProxy__request``.
      params: passed through unchanged to ``_ServerProxy__request``.

    Returns:
      The value returned by the first successful attempt, or ``None`` when
      retries are enabled and all 30 attempts failed.

    Raises:
      HodInterruptException: an attempt failed, retries are enabled and
        ``hodInterrupt`` is set.
      Exception: with message ``"hodXRClientTimeout"`` when an attempt
        fails and ``self.__retryRequests`` is false.
    """
    response = None
    # A single back-off of 5-10 seconds is picked per call and reused for
    # every retry of that call.
    retryWaitTime = 5 + random.randint(0, 5)

    for _ in range(30):
        signal.alarm(self.__timeOut)
        try:
            try:
                response = self._ServerProxy__request(methodname, params)
            finally:
                # Always disarm the alarm, also when the request failed for
                # a reason other than the alarm itself.  Otherwise a stale
                # SIGALRM could fire during the retry sleep below or after
                # this method has already raised.
                # NOTE(review): presumably a SIGALRM handler installed
                # elsewhere turns the alarm into an exception - confirm.
                signal.alarm(0)
        except Exception:
            if not self.__retryRequests:
                raise Exception("hodXRClientTimeout")
            if hodInterrupt.isSet():
                raise HodInterruptException()
            time.sleep(retryWaitTime)
        else:
            break

    return response
def _op_allocate(self, args): operation = "allocate" argLength = len(args) min = 0 max = 0 errorFlag = False errorMsgs = [] if argLength == 3: nodes = args[2] clusterDir = self.__norm_cluster_dir(args[1]) if not os.path.exists(clusterDir): try: os.makedirs(clusterDir) except OSError, err: errorFlag = True errorMsgs.append("Could not create cluster directory. %s" \ % (str(err))) elif not os.path.isdir(clusterDir): errorFlag = True errorMsgs.append( \ "Invalid cluster directory (--hod.clusterdir or -d) : " + \ clusterDir + " : Not a directory") if int(nodes) < 3 : errorFlag = True errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \ "Must be >= 3. Given nodes: %s" % nodes) if errorFlag: for msg in errorMsgs: self.__log.critical(msg) self.__opCode = 3 return if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, \ (os.R_OK, os.W_OK)): self.__log.critical(INVALID_STATE_FILE_MSGS[2] % \ self.__userState.get_state_file()) self.__opCode = 1 return clusterList = self.__userState.read(CLUSTER_DATA_FILE) if clusterDir in clusterList.keys(): self.__setup_cluster_state(clusterDir) clusterInfo = self.__clusterState.read() # Check if the job is not running. Only then can we safely # allocate another cluster. Otherwise the user would need # to deallocate and free up resources himself. if clusterInfo.has_key('jobid') and \ self.__cluster.is_cluster_deallocated(clusterInfo['jobid']): self.__log.warn("Found a dead cluster at cluster directory '%s'. Deallocating it to allocate a new one." % (clusterDir)) self.__remove_cluster(clusterDir) self.__clusterState.clear() else: self.__log.critical("Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused." 
% (clusterDir)) self.__opCode = 12 return self.__setup_cluster_logger(clusterDir) (status, message) = self.__cluster.is_valid_account() if status is not 0: if message: for line in message: self.__log.critical("verify-account output: %s" % line) self.__log.critical("Cluster cannot be allocated because account verification failed. " \ + "verify-account returned exit code: %s." % status) self.__opCode = 4 return else: self.__log.debug("verify-account returned zero exit code.") if message: self.__log.debug("verify-account output: %s" % message) if re.match('\d+-\d+', nodes): (min, max) = nodes.split("-") min = int(min) max = int(max) else: try: nodes = int(nodes) min = nodes max = nodes except ValueError: print self.__hodhelp.help(operation) self.__log.critical( "%s operation requires a pos_int value for n(nodecount)." % operation) self.__opCode = 3 else: self.__setup_cluster_state(clusterDir) clusterInfo = self.__clusterState.read() self.__opCode = self.__cluster.check_cluster(clusterInfo) if self.__opCode == 0 or self.__opCode == 15: self.__setup_service_registry() if hodInterrupt.isSet(): self.__cleanup() raise HodInterruptException() self.__log.debug("Service Registry started.") self.__adjustMasterFailureCountConfig(nodes) try: allocateStatus = self.__cluster.allocate(clusterDir, min, max) except HodInterruptException, h: self.__cleanup() raise h # Allocation has gone through. # Don't care about interrupts any more try: if allocateStatus == 0: self.__set_cluster_state_info(os.environ, self.__cluster.hdfsInfo, self.__cluster.mapredInfo, self.__cluster.ringmasterXRS, self.__cluster.jobId, min, max) self.__setup_cluster_state(clusterDir) self.__clusterState.write(self.__cluster.jobId, self.__clusterStateInfo) # Do we need to check for interrupts here ?? self.__set_user_state_info( { clusterDir : self.__cluster.jobId, } ) self.__opCode = allocateStatus except Exception, e: # Some unknown problem. 
self.__cleanup() self.__cluster.deallocate(clusterDir, self.__clusterStateInfo) self.__opCode = 1 raise Exception(e) elif self.__opCode == 12: self.__log.critical("Cluster %s already allocated." % clusterDir)
errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \ "Must be >= 3. Given nodes: %s" % nodes) if errorFlag: for msg in errorMsgs: self.__log.critical(msg) self.handle_script_exit_code(scriptRet, clusterDir) sys.exit(3) try: self._op_allocate(('allocate', clusterDir, str(nodes))) if self.__opCode == 0: if self.__cfg['hod'].has_key('script-wait-time'): time.sleep(self.__cfg['hod']['script-wait-time']) self.__log.debug('Slept for %d time. Now going to run the script' % self.__cfg['hod']['script-wait-time']) if hodInterrupt.isSet(): self.__log.debug('Hod interrupted - not executing script') else: scriptRunner = hadoopScript(clusterDir, self.__cfg['hod']['original-dir']) self.__opCode = scriptRunner.run(script) scriptRet = self.__opCode self.__log.info("Exit code from running the script: %d" % self.__opCode) else: self.__log.critical("Error %d in allocating the cluster. Cannot run the script." % self.__opCode) if hodInterrupt.isSet(): # Got interrupt while executing script. Unsetting it for deallocating hodInterrupt.setFlag(False) if self._is_cluster_allocated(clusterDir): self._op_deallocate(('deallocate', clusterDir))
def _op_allocate(self, args): operation = "allocate" argLength = len(args) min = 0 max = 0 errorFlag = False errorMsgs = [] if argLength == 3: nodes = args[2] clusterDir = self.__norm_cluster_dir(args[1]) if not os.path.exists(clusterDir): try: os.makedirs(clusterDir) except OSError, err: errorFlag = True errorMsgs.append("Could not create cluster directory. %s" \ % (str(err))) elif not os.path.isdir(clusterDir): errorFlag = True errorMsgs.append( \ "Invalid cluster directory (--hod.clusterdir or -d) : " + \ clusterDir + " : Not a directory") if int(nodes) < 3: errorFlag = True errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \ "Must be >= 3. Given nodes: %s" % nodes) if errorFlag: for msg in errorMsgs: self.__log.critical(msg) self.__opCode = 3 return if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, \ (os.R_OK, os.W_OK)): self.__log.critical(INVALID_STATE_FILE_MSGS[2] % \ self.__userState.get_state_file()) self.__opCode = 1 return clusterList = self.__userState.read(CLUSTER_DATA_FILE) if clusterDir in clusterList.keys(): self.__setup_cluster_state(clusterDir) clusterInfo = self.__clusterState.read() # Check if the job is not running. Only then can we safely # allocate another cluster. Otherwise the user would need # to deallocate and free up resources himself. if clusterInfo.has_key('jobid') and \ self.__cluster.is_cluster_deallocated(clusterInfo['jobid']): self.__log.warn( "Found a dead cluster at cluster directory '%s'. Deallocating it to allocate a new one." % (clusterDir)) self.__remove_cluster(clusterDir) self.__clusterState.clear() else: self.__log.critical( "Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused." 
% (clusterDir)) self.__opCode = 12 return self.__setup_cluster_logger(clusterDir) if re.match('\d+-\d+', nodes): (min, max) = nodes.split("-") min = int(min) max = int(max) else: try: nodes = int(nodes) min = nodes max = nodes except ValueError: print self.__hodhelp.help(operation) self.__log.critical( "%s operation requires a pos_int value for n(nodecount)." % operation) self.__opCode = 3 else: self.__setup_cluster_state(clusterDir) clusterInfo = self.__clusterState.read() self.__opCode = self.__cluster.check_cluster(clusterInfo) if self.__opCode == 0 or self.__opCode == 15: self.__setup_service_registry() if hodInterrupt.isSet(): self.__cleanup() raise HodInterruptException() self.__log.debug("Service Registry started.") self.__adjustMasterFailureCountConfig(nodes) try: allocateStatus = self.__cluster.allocate( clusterDir, min, max) except HodInterruptException, h: self.__cleanup() raise h # Allocation has gone through. # Don't care about interrupts any more try: if allocateStatus == 0: self.__set_cluster_state_info( os.environ, self.__cluster.hdfsInfo, self.__cluster.mapredInfo, self.__cluster.ringmasterXRS, self.__cluster.jobId, min, max) self.__setup_cluster_state(clusterDir) self.__clusterState.write( self.__cluster.jobId, self.__clusterStateInfo) # Do we need to check for interrupts here ?? self.__set_user_state_info({ clusterDir: self.__cluster.jobId, }) self.__opCode = allocateStatus except Exception, e: # Some unknown problem. self.__cleanup() self.__cluster.deallocate(clusterDir, self.__clusterStateInfo) self.__opCode = 1 raise Exception(e) elif self.__opCode == 12: self.__log.critical("Cluster %s already allocated." % clusterDir)
if errorFlag: for msg in errorMsgs: self.__log.critical(msg) self.handle_script_exit_code(scriptRet, clusterDir) sys.exit(3) try: self._op_allocate(('allocate', clusterDir, str(nodes))) if self.__opCode == 0: if self.__cfg['hod'].has_key('script-wait-time'): time.sleep(self.__cfg['hod']['script-wait-time']) self.__log.debug( 'Slept for %d time. Now going to run the script' % self.__cfg['hod']['script-wait-time']) if hodInterrupt.isSet(): self.__log.debug('Hod interrupted - not executing script') else: scriptRunner = hadoopScript( clusterDir, self.__cfg['hod']['original-dir']) self.__opCode = scriptRunner.run(script) scriptRet = self.__opCode self.__log.info("Exit code from running the script: %d" % self.__opCode) else: self.__log.critical( "Error %d in allocating the cluster. Cannot run the script." % self.__opCode) if hodInterrupt.isSet(): # Got interrupt while executing script. Unsetting it for deallocating