Exemplo n.º 1
0
 def _register_service(self, port=None, installSignalHandlers=1):
   """Register this service's endpoints with the service registry.

   Does nothing when no service registry address is configured
   (self.__svcrgy is falsy).  Otherwise a hodXRClient for the registry is
   created and registerService is called exactly once with a dict that
   holds an 'xrs' URL when self._xrc is set and an 'http' URL when
   self._http is set; the dict is empty when neither server exists.

   port -- not used by this method; kept so existing callers still work.
   installSignalHandlers -- passed through as the last positional
     argument of hodXRClient.
   """
   if not self.__svcrgy:
     return

   self.logs['main'].info(
       "Registering service with service registery %s... " % self.__svcrgy)
   svcrgy = hodXRClient(self.__svcrgy, None, None, 0, 0, installSignalHandlers)

   # Advertise only the servers that this service actually has.
   endpoints = {}
   if self._xrc:
     endpoints['xrs'] = "http://%s:%s" % (self._xrc.server_address[0],
                                          self._xrc.server_address[1])
   if self._http:
     endpoints['http'] = "http://%s:%s" % (self._http.server_address[0],
                                           self._http.server_address[1])

   # Always register under self.name.  The former no-endpoint branch passed
   # a bare `name`, which is not a local of this method and failed with
   # NameError whenever neither server was configured.
   svcrgy.registerService(self._cfg['userid'], self._serviceID,
                          self.hostname, self.name, 'hod', endpoints)
Exemplo n.º 2
0
 def testFailure(self):
     """HOD should raise Exception when unregistered rpc is called

     A client is pointed at the module-level serverPort on localhost with
     request retries disabled, and calling its noMethod attribute is
     expected to raise.
     """
     global serverPort
     serverUrl = 'http://localhost:' + str(serverPort)
     rpcClient = hodXRClient(serverUrl, retryRequests=False)
     self.assertRaises(Exception, rpcClient.noMethod)
Exemplo n.º 3
0
 def testTimeout(self):
   """HOD should raise Exception when rpc call times out

   The client is aimed at a localhost port obtained from
   ServiceUtil.getUniqRandomPort (presumably one nothing is listening on --
   confirm against ServiceUtil), with retries disabled.
   """
   unusedPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                              high=50000)
   deadUrl = 'http://localhost:' + str(unusedPort)
   rpcClient = hodXRClient(deadUrl, retryRequests=False)
   self.assertRaises(Exception, rpcClient.testing)
Exemplo n.º 4
0
 def testTimeout(self):
     """HOD should raise Exception when rpc call times out

     The client is aimed at a localhost port obtained from
     ServiceUtil.getUniqRandomPort (presumably one nothing is listening
     on -- confirm against ServiceUtil), with retries disabled.
     """
     unusedPort = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                                high=50000)
     deadUrl = 'http://localhost:' + str(unusedPort)
     rpcClient = hodXRClient(deadUrl, retryRequests=False)
     self.assertRaises(Exception, rpcClient.testing)
Exemplo n.º 5
0
  def testInterrupt(self):
    """ HOD should raise HodInterruptException when interrupted

    The global interrupt flag is set, then an RPC is attempted from a
    worker thread and is expected to raise HodInterruptException.
    """
    port = ServiceUtil.getUniqRandomPort(h='localhost', low=40000, high=50000)
    client = hodXRClient('http://localhost:' + str(port))

    def expectInterrupt(case):
      # Runs on the worker thread.
      # NOTE(review): an assertion failure raised here is not propagated
      # to the test by Thread.join(), and the flag set below is never
      # cleared in this method -- confirm that is acceptable.
      case.assertRaises(HodInterruptException, client.testing)

    worker = threading.Thread(name='testinterrupt', target=expectInterrupt,
                              args=(self,))
    # Set the global interrupt before the worker issues its call.
    hodInterrupt.setFlag()
    worker.start()
    worker.join()
Exemplo n.º 6
0
    def testInterrupt(self):
        """ HOD should raise HodInterruptException when interrupted

        The global interrupt flag is set, then an RPC is attempted from a
        worker thread and is expected to raise HodInterruptException.
        """
        port = ServiceUtil.getUniqRandomPort(h='localhost', low=40000,
                                             high=50000)
        client = hodXRClient('http://localhost:' + str(port))

        def expectInterrupt(case):
            # Runs on the worker thread.
            # NOTE(review): an assertion failure raised here is not
            # propagated to the test by Thread.join(), and the flag set
            # below is never cleared in this method -- confirm that is
            # acceptable.
            case.assertRaises(HodInterruptException, client.testing)

        worker = threading.Thread(name='testinterrupt',
                                  target=expectInterrupt,
                                  args=(self, ))
        # Set the global interrupt before the worker issues its call.
        hodInterrupt.setFlag()
        worker.start()
        worker.join()
Exemplo n.º 7
0
    def _register_service(self, port=None, installSignalHandlers=1):
        """Register this service's endpoints with the service registry.

        Does nothing when no service registry address is configured
        (self.__svcrgy is falsy).  Otherwise a hodXRClient for the registry
        is created and registerService is called exactly once with a dict
        that holds an 'xrs' URL when self._xrc is set and an 'http' URL
        when self._http is set; the dict is empty when neither server
        exists.

        port -- not used by this method; kept so existing callers still
          work.
        installSignalHandlers -- passed through as the last positional
          argument of hodXRClient.
        """
        if not self.__svcrgy:
            return

        self.logs['main'].info(
            "Registering service with service registery %s... " %
            self.__svcrgy)
        svcrgy = hodXRClient(self.__svcrgy, None, None, 0, 0,
                             installSignalHandlers)

        # Advertise only the servers that this service actually has.
        endpoints = {}
        if self._xrc:
            endpoints['xrs'] = "http://%s:%s" % (
                self._xrc.server_address[0], self._xrc.server_address[1])
        if self._http:
            endpoints['http'] = "http://%s:%s" % (
                self._http.server_address[0], self._http.server_address[1])

        # Always register under self.name.  The former no-endpoint branch
        # passed a bare `name`, which is not a local of this method and
        # failed with NameError whenever neither server was configured.
        svcrgy.registerService(self._cfg['userid'], self._serviceID,
                               self.hostname, self.name, 'hod', endpoints)
Exemplo n.º 8
0
    def __init__(self, cfg, log, **kwds):
        """Set up the ringmaster: node pool, service descriptors and servers.

        In order, the constructor:
          * stores cfg/log and initialises bookkeeping flags;
          * installs signal handlers and obtains the node pool and its
            service id;
          * when cfg["ringmaster"] has a "hadoop-tar-ball" entry, copies the
            tarball into the temp directory it changes into;
          * builds the HDFS and Map/Reduce service objects (external or
            locally managed) and stores them in self.serviceDict;
          * starts the ringmaster RPC service and, when a tarball is being
            served, an HTTP server for it;
          * registers with the service registry if
            cfg["ringmaster"]["register"] is set;
          * fills in the cfg["hodring"] values and starts the job tracker
            monitor.

        cfg  -- configuration object indexed by section name.
        log  -- logger used for debug/critical messages.
        kwds -- accepted but not referenced anywhere in this method.

        Raises Exception when the copied tarball cannot be found or the
        Hadoop version cannot be determined; any exception raised while
        building the service descriptors is logged and re-raised.
        """
        self.download = False
        self.httpServer = None
        self.cfg = cfg
        self.log = log
        self.__hostname = local_fqdn()
        self.workDirs = None

        # ref to the idle job tracker object.
        self.__jtMonitor = None
        self.__idlenessDetected = False
        self.__stopInProgress = False
        self.__isStopped = False  # to let main exit
        self.__exitCode = 0  # exit code with which the ringmaster main method should return

        self.workers_per_ring = self.cfg["ringmaster"]["workers_per_ring"]

        self.__initialize_signal_handlers()

        # NOTE(review): gsvc ends up as the first value of the servicedesc
        # mapping but is never read again in this method.
        sdd = self.cfg["servicedesc"]
        gsvc = None
        for key in sdd:
            gsvc = sdd[key]
            break

        npd = self.cfg["nodepooldesc"]
        self.np = NodePoolUtil.getNodePool(npd, cfg, log)

        self.log.debug("Getting service ID.")

        self.serviceId = self.np.getServiceId()

        self.log.debug("Got service ID: %s" % self.serviceId)

        # A configured tarball switches on "download" mode for this ringmaster.
        self.tarSrcLoc = None
        if self.cfg["ringmaster"].has_key("hadoop-tar-ball"):
            self.download = True
            self.tarSrcLoc = self.cfg["ringmaster"]["hadoop-tar-ball"]

        self.cd_to_tempdir()

        if self.download:
            # The tarball is copied into (and later looked up in) the current
            # working directory, which cd_to_tempdir() has just changed.
            self.__copy_tarball(os.getcwd())
            self.basename = self.__find_tarball_in_dir(os.getcwd())
            if self.basename is None:
                raise Exception("Did not find tarball copied from %s in %s." % (self.tarSrcLoc, os.getcwd()))

        self.serviceAddr = to_http_url(self.cfg["ringmaster"]["svcrgy-addr"])

        self.log.debug("Service registry @ %s" % self.serviceAddr)

        self.serviceClient = hodXRClient(self.serviceAddr)
        self.serviceDict = {}
        try:
            sdl = self.cfg["servicedesc"]

            # NOTE(review): this local is distinct from self.workDirs, which
            # stays None unless getWorkDirs() sets it -- confirm.
            workDirs = self.getWorkDirs(cfg)

            hdfsDesc = sdl["hdfs"]
            hdfs = None

            # Determine hadoop Version
            hadoopVers = hadoopVersion(self.__getHadoopDir(), self.cfg["hodring"]["java-home"], self.log)

            if (hadoopVers["major"] == None) or (hadoopVers["minor"] == None):
                raise Exception(
                    "Could not retrive the version of Hadoop."
                    + " Check the Hadoop installation or the value of the hodring.java-home variable."
                )
            # External services are only parameterised from config; managed
            # ones are created with the per-ring worker count.
            if hdfsDesc.isExternal():
                hdfs = HdfsExternal(hdfsDesc, workDirs, version=int(hadoopVers["minor"]))
                hdfs.setMasterParams(self.cfg["gridservice-hdfs"])
            else:
                hdfs = Hdfs(
                    hdfsDesc, workDirs, 0, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring
                )

            self.serviceDict[hdfs.getName()] = hdfs

            mrDesc = sdl["mapred"]
            mr = None
            if mrDesc.isExternal():
                mr = MapReduceExternal(mrDesc, workDirs, version=int(hadoopVers["minor"]))
                mr.setMasterParams(self.cfg["gridservice-mapred"])
            else:
                mr = MapReduce(
                    mrDesc, workDirs, 1, version=int(hadoopVers["minor"]), workers_per_ring=self.workers_per_ring
                )

            self.serviceDict[mr.getName()] = mr
        except:
            # Log the failure and let the original exception propagate.
            self.log.critical(
                "Exception in creating Hdfs and Map/Reduce descriptor objects: \
                            %s."
                % get_exception_error_string()
            )
            self.log.debug(get_exception_string())
            raise

        # should not be starting these in a constructor
        ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)

        self.rpcserver = ringMasterServer.getAddress()

        self.httpAddress = None
        self.tarAddress = None
        # Note: the HTTP server is bound using socket.gethostname(), whereas
        # registration below uses self.__hostname from local_fqdn().
        hostname = socket.gethostname()
        if self.download:
            self.httpServer = threadedHTTPServer(hostname, self.cfg["ringmaster"]["http-port-range"])

            # NOTE(review): presumably returns immediately (serving in the
            # background), since the address is read right after -- confirm
            # against threadedHTTPServer.
            self.httpServer.serve_forever()
            self.httpAddress = "http://%s:%d/" % (self.httpServer.server_address[0], self.httpServer.server_address[1])
            self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

            ringMasterServer.instance.logMasterSources.registerTarSource(hostname, self.tarAddress)
        else:
            self.log.debug("Download not set.")

        self.log.debug(
            "%s %s %s %s %s" % (self.cfg["ringmaster"]["userid"], self.serviceId, self.__hostname, "ringmaster", "hod")
        )

        # Publish the RPC endpoint, plus the HTTP endpoint when one exists.
        if self.cfg["ringmaster"]["register"]:
            if self.httpAddress:
                self.serviceClient.registerService(
                    self.cfg["ringmaster"]["userid"],
                    self.serviceId,
                    self.__hostname,
                    "ringmaster",
                    "hod",
                    {"xrs": self.rpcserver, "http": self.httpAddress},
                )
            else:
                self.serviceClient.registerService(
                    self.cfg["ringmaster"]["userid"],
                    self.serviceId,
                    self.__hostname,
                    "ringmaster",
                    "hod",
                    {"xrs": self.rpcserver},
                )

        # NOTE(review): this message is logged even when the "register"
        # option above is false and no registration took place.
        self.log.debug("Registered with serivce registry: %s." % self.serviceAddr)

        # NOTE(review): hodRingPath is computed but not used below.
        hodRingPath = os.path.join(cfg["ringmaster"]["base-dir"], "bin", "hodring")
        hodRingWorkDir = os.path.join(cfg["hodring"]["temp-dir"], "hodring" + "_" + getpass.getuser())

        # Hand the hodring section the values derived in this constructor.
        self.cfg["hodring"]["hodring"] = [hodRingWorkDir]
        self.cfg["hodring"]["svcrgy-addr"] = self.cfg["ringmaster"]["svcrgy-addr"]
        self.cfg["hodring"]["service-id"] = self.np.getServiceId()

        self.cfg["hodring"]["ringmaster-xrs-addr"] = self.__url_to_addr(self.rpcserver)

        # The download address is only published when a tarball was configured.
        if self.tarSrcLoc != None:
            cfg["hodring"]["download-addr"] = self.tarAddress

        self.__init_job_tracker_monitor(ringMasterServer.instance.logMasterSources)
Exemplo n.º 9
0
 def testFailure(self):
   """HOD should raise Exception when unregistered rpc is called

   A client is pointed at the module-level serverPort on localhost with
   request retries disabled, and calling its noMethod attribute is
   expected to raise.
   """
   global serverPort
   serverUrl = 'http://localhost:' + str(serverPort)
   rpcClient = hodXRClient(serverUrl, retryRequests=False)
   self.assertRaises(Exception, rpcClient.noMethod)
Exemplo n.º 10
0
  def start(self):
    """Run and maintain hodring commands"""
    
    try:
      if self._cfg.has_key('download-addr'):
        self._http = threadedHTTPServer('', self._cfg['http-port-range'])
        self.log.info("Starting http server...")
        self._http.serve_forever()
        self.log.debug("http://%s:%d" % (self._http.server_address[0],
                     self._http.server_address[1]))
      
      hodBaseService.start(self)
      
      ringXRAddress = None
      if self._cfg.has_key('ringmaster-xrs-addr'):
        ringXRAddress = "http://%s:%s/" % (self._cfg['ringmaster-xrs-addr'][0],
                          self._cfg['ringmaster-xrs-addr'][1])
        self.log.debug("Ringmaster at %s" % ringXRAddress)

      self.log.debug("Creating service registry XML-RPC client.")
      serviceClient = hodXRClient(to_http_url(
                                  self._cfg['svcrgy-addr']))
      if ringXRAddress == None:
        self.log.info("Did not get ringmaster XML-RPC address. Fetching information from service registry.")
        ringList = serviceClient.getServiceInfo(self._cfg['userid'], 
            self._cfg['service-id'], 'ringmaster', 'hod')
      
        self.log.debug(pprint.pformat(ringList))
      
        if len(ringList):
          if isinstance(ringList, list):
            ringXRAddress = ringList[0]['xrs']
      
        count = 0
        while (ringXRAddress == None and count < 3000):
          ringList = serviceClient.getServiceInfo(self._cfg['userid'], 
            self._cfg['service-id'], 'ringmaster', 'hod')
        
          if len(ringList):
            if isinstance(ringList, list):
              ringXRAddress = ringList[0]['xrs']
        
          count = count + 1
          time.sleep(.2)
      
      if ringXRAddress == None:
        raise Exception("Could not get ringmaster XML-RPC server address.")
        
      self.log.debug("Creating ringmaster XML-RPC client.")
      ringClient = hodXRClient(ringXRAddress)    
      
      id = self.hostname + "_" + str(os.getpid())
      
      if 'download-addr' in self._cfg:
        self.__download_package(ringClient)
      else:
        self.log.debug("Did not find a download address.")
          
      cmdlist = []
      firstTime = True
      increment = 0
      hadoopStartupTime = 2
       
      cmdlist = ringClient.getCommand(id)

      while (cmdlist == []):
        if firstTime:
          sleepTime = increment + self._cfg['cmd-retry-initial-time'] + hadoopStartupTime\
                        + random.uniform(0,self._cfg['cmd-retry-interval'])
          firstTime = False
        else:
          sleepTime = increment + self._cfg['cmd-retry-initial-time'] + \
                        + random.uniform(0,self._cfg['cmd-retry-interval'])
        self.log.debug("Did not get command list. Waiting for %s seconds." % (sleepTime))
        time.sleep(sleepTime)
        increment = increment + 1
        cmdlist = ringClient.getCommand(id)

      self.log.debug(pformat(cmdlist)) 
      cmdDescs = []
      for cmds in cmdlist:
        cmdDescs.append(CommandDesc(cmds['dict'], self.log))
  
      self._cfg['commanddesc'] = cmdDescs
      
      self.log.info("Running hadoop commands...")

      self.__run_hadoop_commands(False)
        
      masterParams = []
      for k, cmd in self.__running.iteritems():
        masterParams.extend(cmd.filledInKeyVals)
  
      self.log.debug("printing getparams")
      self.log.debug(pformat(id))
      self.log.debug(pformat(masterParams))
      # when this is on a required host, the ringMaster already has our masterParams
      if(len(masterParams) > 0):
        ringClient.addMasterParams(id, masterParams)
    except Exception, e:
      raise Exception(e)
Exemplo n.º 11
0
  def allocate(self, clusterDir, min, max=None):
    status = 0
    failureCount = 0
    self.__svcrgyClient = self.__get_svcrgy_client()
        
    self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
    
    if min < 3:
      self.__log.critical("Minimum nodes must be greater than 2.")
      status = 2
    else:
      nodeSet = self.__nodePool.newNodeSet(min)
      walltime = None
      if self.__cfg['hod'].has_key('walltime'):
        walltime = self.__cfg['hod']['walltime']
      self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
      # if the job submission returned an error other than no resources
      # retry a couple of times
      while (self.jobId is False) and (exitCode != 188):
        if hodInterrupt.isSet():
          raise HodInterruptException()

        failureCount += 1
        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
          self.__log.debug("failed submitting job more than the retries. exiting")
          break
        else:
          # wait a bit before retrying
          time.sleep(self.__cfg['hod']['job-command-failure-interval'])
          if hodInterrupt.isSet():
            raise HodInterruptException()
          self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)

      if self.jobId:
        jobStatus = None
        try:
          jobStatus = self.__check_job_status()
        except HodInterruptException, h:
          self.__log.info(HOD_INTERRUPTED_MESG)
          self.delete_job(self.jobId)
          self.__log.info("Cluster %s removed from queue." % self.jobId)
          raise h
        else:
          if jobStatus == -1:
            self.delete_job(self.jobId);
            status = 4
            return status

        if jobStatus:
          self.__log.info("Cluster Id %s" \
                                                              % self.jobId)
          try:
            self.ringmasterXRS = self.__get_ringmaster_client()
            
            self.__log.debug("Ringmaster at : %s" % self.ringmasterXRS )
            ringClient = None
            if self.ringmasterXRS:
              ringClient =  hodXRClient(self.ringmasterXRS)
                
              hdfsStatus, hdfsAddr, self.hdfsInfo = \
                self.__init_hadoop_service('hdfs', ringClient)
                
              if hdfsStatus:
                self.__log.info("HDFS UI at http://%s" % self.hdfsInfo)
  
                mapredStatus, mapredAddr, self.mapredInfo = \
                  self.__init_hadoop_service('mapred', ringClient)
  
                if mapredStatus:
                  self.__log.info("Mapred UI at http://%s" % self.mapredInfo)
  
                  if self.__cfg['hod'].has_key('update-worker-info') \
                    and self.__cfg['hod']['update-worker-info']:
                    workerInfoMap = {}
                    workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo
                    workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo
                    # Ringmaster URL sample format : http://hostname:port/
                    workerInfoMap['RM RPC Port'] = '%s' % self.ringmasterXRS.split(":")[2].strip("/")
                    if mapredAddr.find(':') != -1:
                      workerInfoMap['Mapred RPC Port'] = mapredAddr.split(':')[1]
                    ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId)
                    if ret != 0:
                      self.__log.warn('Could not update HDFS and Mapred information.' \
                                      'User Portal may not show relevant information.' \
                                      'Error code=%s' % ret)
  
                  self.__cfg.replace_escape_seqs()
                    
                  # Go generate the client side hadoop-site.xml now
                  # adding final-params as well, just so that conf on 
                  # client-side and server-side are (almost) the same
                  clientParams = None
                  serverParams = {}
                  finalServerParams = {}
  
                  # client-params
                  if self.__cfg['hod'].has_key('client-params'):
                    clientParams = self.__cfg['hod']['client-params']
  
                  # server-params
                  if self.__cfg['gridservice-mapred'].has_key('server-params'):
                    serverParams.update(\
                      self.__cfg['gridservice-mapred']['server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key('server-params'):
                    # note that if there are params in both mapred and hdfs
                    # sections, the ones in hdfs overwirte the ones in mapred
                    serverParams.update(\
                        self.__cfg['gridservice-hdfs']['server-params'])
                    
                  # final-server-params
                  if self.__cfg['gridservice-mapred'].has_key(\
                                                    'final-server-params'):
                    finalServerParams.update(\
                      self.__cfg['gridservice-mapred']['final-server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key(
                                                    'final-server-params'):
                    finalServerParams.update(\
                        self.__cfg['gridservice-hdfs']['final-server-params'])
  
                  clusterFactor = self.__cfg['hod']['cluster-factor']
                  tempDir = self.__cfg['hod']['temp-dir']
                  if not os.path.exists(tempDir):
                    os.makedirs(tempDir)
                  tempDir = os.path.join( tempDir, self.__cfg['hod']['userid']\
                                  + "." + self.jobId )
                  mrSysDir = getMapredSystemDirectory(self.__cfg['hodring']['mapred-system-dir-root'],\
                                      self.__cfg['hod']['userid'], self.jobId)
                  self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\
                            hdfsAddr, mrSysDir, mapredAddr, clientParams,\
                            serverParams, finalServerParams,\
                            clusterFactor)
                  self.__log.info("hadoop-site.xml at %s" % clusterDir)
                  # end of hadoop-site.xml generation
                else:
                  status = 8
              else:
                status = 7  
            else:
              status = 6
            if status != 0:
              self.__log.debug("Cleaning up cluster id %s, as cluster could not be allocated." % self.jobId)
              if ringClient is None:
                self.delete_job(self.jobId)
              else:
                self.__log.debug("Calling rm.stop()")
                ringClient.stopRM()
                self.__log.debug("Returning from rm.stop()")
          except HodInterruptException, h:
            self.__log.info(HOD_INTERRUPTED_MESG)
            if self.ringmasterXRS:
              if ringClient is None:
                ringClient =  hodXRClient(self.ringmasterXRS)
              self.__log.debug("Calling rm.stop()")
              ringClient.stopRM()
              self.__log.debug("Returning from rm.stop()")
              self.__log.info("Cluster Shutdown by informing ringmaster.")
            else:
              self.delete_job(self.jobId)
              self.__log.info("Cluster %s removed from queue directly." % self.jobId)
            raise h
        else:
Exemplo n.º 12
0
      self.__log.info("Exception in collecting Job tracker logs. Ignoring.")
    
    rmAddr = None
    if clusterInfo.has_key('ring'):
      # format is http://host:port/ We need host:port
      rmAddr = clusterInfo['ring'][7:]
      if rmAddr.endswith('/'):
        rmAddr = rmAddr[:-1]

    if (rmAddr is None) or (not self.__isRingMasterAlive(rmAddr)):
      # Cluster is already dead, don't try to contact ringmaster.
      self.__nodePool.finalize()
      status = 10 # As cluster is dead, we just set the status to 'cluster dead'.
    else:
      xrsAddr = clusterInfo['ring']
      rmClient = hodXRClient(xrsAddr)
      self.__log.debug('calling rm.stop')
      rmClient.stopRM()
      self.__log.debug('completed rm.stop')

    # cleanup hod temp dirs
    tempDir = os.path.join( self.__cfg['hod']['temp-dir'], \
                    self.__cfg['hod']['userid'] + "." + clusterInfo['jobid'] )
    if os.path.exists(tempDir):
      shutil.rmtree(tempDir)
   
    return status
  
class hadoopScript:
  def __init__(self, conf, execDir):
    self.__environ = os.environ.copy()
Exemplo n.º 13
0
    def clusterStart(self, initialize=True):
        """Start a stopped mapreduce/dfs cluster.

        Looks up the ringmaster through the service registry (at most 3000
        lookups), fetches this node's command list and runs it.  With
        initialize true the package is downloaded when a 'download-addr'
        is configured, regular commands are fetched and the resulting
        master parameters are reported back; otherwise admin commands are
        fetched and simply run.

        Always returns True: any exception is logged and swallowed.
        NOTE(review): the registry and command polling loops below retry
        without any pause between attempts.
        """
        if initialize:
            self.log.debug('clusterStart Method Invoked - Initialize')
        else:
            self.log.debug('clusterStart Method Invoked - No Initialize')

        try:
            self.log.debug("Creating service registry XML-RPC client.")
            registryUrl = to_http_url(self._cfg['svcrgy-addr'])
            registry = hodXRClient(registryUrl, None, None, 0, 0, 0)

            self.log.info(
                "Fetching ringmaster information from service registry.")
            ringUrl = None
            for _ in range(3000):
                entries = registry.getServiceInfo(self._cfg['userid'],
                                                  self._cfg['service-id'],
                                                  'ringmaster', 'hod')
                if len(entries) and isinstance(entries, list):
                    ringUrl = entries[0]['xrs']
                if ringUrl is not None:
                    break

            if ringUrl is None:
                raise Exception(
                    "Could not get ringmaster XML-RPC server address.")

            self.log.debug("Creating ringmaster XML-RPC client.")
            ringClient = hodXRClient(ringUrl, None, None, 0, 0, 0)

            nodeId = self.hostname + "_" + str(os.getpid())

            if initialize:
                if 'download-addr' in self._cfg:
                    self.__download_package(ringClient)
                else:
                    self.log.debug("Did not find a download address.")

            # Poll until the ringmaster returns a non-empty command list.
            commands = []
            while commands == []:
                if initialize:
                    commands = ringClient.getCommand(nodeId)
                else:
                    commands = ringClient.getAdminCommand(nodeId)

            self.log.debug(pformat(commands))
            self._cfg['commanddesc'] = [
                CommandDesc(entry['dict'], self.log) for entry in commands
            ]

            if initialize:
                self.log.info("Running hadoop commands again... - Initialize")
            else:
                self.log.info(
                    "Running hadoop commands again... - No Initialize")
            self.__run_hadoop_commands()

            if initialize:
                masterParams = []
                for _, cmd in self.__running.iteritems():
                    self.log.debug(cmd)
                    masterParams.extend(cmd.filledInKeyVals)

                self.log.debug("printing getparams")
                self.log.debug(pformat(nodeId))
                self.log.debug(pformat(masterParams))
                # when this is on a required host, the ringMaster already has our masterParams
                if masterParams:
                    ringClient.addMasterParams(nodeId, masterParams)

        except:
            self.log.error(get_exception_string())

        return True
Exemplo n.º 14
0
    def start(self):
        """Run and maintain hodring commands"""

        try:
            if self._cfg.has_key('download-addr'):
                self._http = threadedHTTPServer('',
                                                self._cfg['http-port-range'])
                self.log.info("Starting http server...")
                self._http.serve_forever()
                self.log.debug("http://%s:%d" % (self._http.server_address[0],
                                                 self._http.server_address[1]))

            hodBaseService.start(self)

            ringXRAddress = None
            if self._cfg.has_key('ringmaster-xrs-addr'):
                ringXRAddress = "http://%s:%s/" % (
                    self._cfg['ringmaster-xrs-addr'][0],
                    self._cfg['ringmaster-xrs-addr'][1])
                self.log.debug("Ringmaster at %s" % ringXRAddress)

            self.log.debug("Creating service registry XML-RPC client.")
            serviceClient = hodXRClient(to_http_url(self._cfg['svcrgy-addr']))
            if ringXRAddress == None:
                self.log.info(
                    "Did not get ringmaster XML-RPC address. Fetching information from service registry."
                )
                ringList = serviceClient.getServiceInfo(
                    self._cfg['userid'], self._cfg['service-id'], 'ringmaster',
                    'hod')

                self.log.debug(pprint.pformat(ringList))

                if len(ringList):
                    if isinstance(ringList, list):
                        ringXRAddress = ringList[0]['xrs']

                count = 0
                while (ringXRAddress == None and count < 3000):
                    ringList = serviceClient.getServiceInfo(
                        self._cfg['userid'], self._cfg['service-id'],
                        'ringmaster', 'hod')

                    if len(ringList):
                        if isinstance(ringList, list):
                            ringXRAddress = ringList[0]['xrs']

                    count = count + 1
                    time.sleep(.2)

            if ringXRAddress == None:
                raise Exception(
                    "Could not get ringmaster XML-RPC server address.")

            self.log.debug("Creating ringmaster XML-RPC client.")
            ringClient = hodXRClient(ringXRAddress)

            id = self.hostname + "_" + str(os.getpid())

            if 'download-addr' in self._cfg:
                self.__download_package(ringClient)
            else:
                self.log.debug("Did not find a download address.")

            cmdlist = []
            firstTime = True
            increment = 0
            hadoopStartupTime = 2

            cmdlist = ringClient.getCommand(id)

            while (cmdlist == []):
                if firstTime:
                    sleepTime = increment + self._cfg['cmd-retry-initial-time'] + hadoopStartupTime\
                                  + random.uniform(0,self._cfg['cmd-retry-interval'])
                    firstTime = False
                else:
                    sleepTime = increment + self._cfg['cmd-retry-initial-time'] + \
                                  + random.uniform(0,self._cfg['cmd-retry-interval'])
                self.log.debug(
                    "Did not get command list. Waiting for %s seconds." %
                    (sleepTime))
                time.sleep(sleepTime)
                increment = increment + 1
                cmdlist = ringClient.getCommand(id)

            self.log.debug(pformat(cmdlist))
            cmdDescs = []
            for cmds in cmdlist:
                cmdDescs.append(CommandDesc(cmds['dict'], self.log))

            self._cfg['commanddesc'] = cmdDescs

            self.log.info("Running hadoop commands...")

            self.__run_hadoop_commands(False)

            masterParams = []
            for k, cmd in self.__running.iteritems():
                masterParams.extend(cmd.filledInKeyVals)

            self.log.debug("printing getparams")
            self.log.debug(pformat(id))
            self.log.debug(pformat(masterParams))
            # when this is on a required host, the ringMaster already has our masterParams
            if (len(masterParams) > 0):
                ringClient.addMasterParams(id, masterParams)
        except Exception, e:
            raise Exception(e)
Exemplo n.º 15
0
      self.__log.info("Exception in collecting Job tracker logs. Ignoring.")
    
    rmAddr = None
    if clusterInfo.has_key('ring'):
      # format is http://host:port/ We need host:port
      rmAddr = clusterInfo['ring'][7:]
      if rmAddr.endswith('/'):
        rmAddr = rmAddr[:-1]

    if (rmAddr is None) or (not self.__isRingMasterAlive(rmAddr)):
      # Cluster is already dead, don't try to contact ringmaster.
      self.__nodePool.finalize()
      status = 10 # As cluster is dead, we just set the status to 'cluster dead'.
    else:
      xrsAddr = clusterInfo['ring']
      rmClient = hodXRClient(xrsAddr)
      self.__log.debug('calling rm.stop')
      rmClient.stopRM()
      self.__log.debug('completed rm.stop')

    # cleanup hod temp dirs
    tempDir = os.path.join( self.__cfg['hod']['temp-dir'], \
                    self.__cfg['hod']['userid'] + "." + clusterInfo['jobid'] )
    if os.path.exists(tempDir):
      shutil.rmtree(tempDir)
   
    return status
  
class hadoopScript:
  def __init__(self, conf, execDir):
    self.__environ = os.environ.copy()
Exemplo n.º 16
0
 def __get_svcrgy_client(self):
   """Return a hodXRClient for the service registry.

   The registry address comes from the 'xrs-address' entry of the 'hod'
   config section and is converted with to_http_url() first.
   """
   hodSection = self.__cfg['hod']
   return hodXRClient(to_http_url(hodSection['xrs-address']))
Exemplo n.º 17
0
 def __get_svcrgy_client(self):
   """Create the client used to talk to the service registry.

   Reads self.__cfg['hod']['xrs-address'], turns it into an http url with
   to_http_url() and wraps it in a hodXRClient.
   """
   registryAddress = self.__cfg['hod']['xrs-address']
   registryUrl = to_http_url(registryAddress)
   return hodXRClient(registryUrl)
Exemplo n.º 18
0
  def allocate(self, clusterDir, min, max=None):
    status = 0
    failureCount = 0
    self.__svcrgyClient = self.__get_svcrgy_client()
        
    self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
    
    if min < 3:
      self.__log.critical("Minimum nodes must be greater than 2.")
      status = 2
    else:
      nodeSet = self.__nodePool.newNodeSet(min)
      walltime = None
      if self.__cfg['hod'].has_key('walltime'):
        walltime = self.__cfg['hod']['walltime']
      self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
      # if the job submission returned an error other than no resources
      # retry a couple of times
      while (self.jobId is False) and (exitCode != 188):
        if hodInterrupt.isSet():
          raise HodInterruptException()

        failureCount += 1
        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
          self.__log.debug("failed submitting job more than the retries. exiting")
          break
        else:
          # wait a bit before retrying
          time.sleep(self.__cfg['hod']['job-command-failure-interval'])
          if hodInterrupt.isSet():
            raise HodInterruptException()
          self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)

      if self.jobId:
        jobStatus = None
        try:
          jobStatus = self.__check_job_status()
        except HodInterruptException, h:
          self.__log.info(HOD_INTERRUPTED_MESG)
          self.delete_job(self.jobId)
          self.__log.info("Cluster %s removed from queue." % self.jobId)
          raise h
        else:
          if jobStatus == -1:
            self.delete_job(self.jobId);
            status = 4
            return status

        if jobStatus:
          self.__log.info("Cluster Id %s" \
                                                              % self.jobId)
          try:
            self.ringmasterXRS = self.__get_ringmaster_client()
            
            self.__log.debug("Ringmaster at : %s" % self.ringmasterXRS )
            ringClient = None
            if self.ringmasterXRS:
              ringClient =  hodXRClient(self.ringmasterXRS)
                
              hdfsStatus, hdfsAddr, self.hdfsInfo = \
                self.__init_hadoop_service('hdfs', ringClient)
                
              if hdfsStatus:
                self.__log.info("HDFS UI at http://%s" % self.hdfsInfo)
  
                mapredStatus, mapredAddr, self.mapredInfo = \
                  self.__init_hadoop_service('mapred', ringClient)
  
                if mapredStatus:
                  self.__log.info("Mapred UI at http://%s" % self.mapredInfo)
  
                  if self.__cfg['hod'].has_key('update-worker-info') \
                    and self.__cfg['hod']['update-worker-info']:
                    workerInfoMap = {}
                    workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo
                    workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo
                    if mapredAddr.find(':') != -1:
                      workerInfoMap['Mapred RPC Port'] = mapredAddr.split(':')[1]
                    ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId)
                    if ret != 0:
                      self.__log.warn('Could not update HDFS and Mapred information.' \
                                      'User Portal may not show relevant information.' \
                                      'Error code=%s' % ret)
  
                  self.__cfg.replace_escape_seqs()
                    
                  # Go generate the client side hadoop-site.xml now
                  # adding final-params as well, just so that conf on 
                  # client-side and server-side are (almost) the same
                  clientParams = None
                  serverParams = {}
                  finalServerParams = {}
  
                  # client-params
                  if self.__cfg['hod'].has_key('client-params'):
                    clientParams = self.__cfg['hod']['client-params']
  
                  # server-params
                  if self.__cfg['gridservice-mapred'].has_key('server-params'):
                    serverParams.update(\
                      self.__cfg['gridservice-mapred']['server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key('server-params'):
                    # note that if there are params in both mapred and hdfs
                    # sections, the ones in hdfs overwirte the ones in mapred
                    serverParams.update(\
                        self.__cfg['gridservice-hdfs']['server-params'])
                    
                  # final-server-params
                  if self.__cfg['gridservice-mapred'].has_key(\
                                                    'final-server-params'):
                    finalServerParams.update(\
                      self.__cfg['gridservice-mapred']['final-server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key(
                                                    'final-server-params'):
                    finalServerParams.update(\
                        self.__cfg['gridservice-hdfs']['final-server-params'])
  
                  clusterFactor = self.__cfg['hod']['cluster-factor']
                  tempDir = self.__cfg['hod']['temp-dir']
                  if not os.path.exists(tempDir):
                    os.makedirs(tempDir)
                  tempDir = os.path.join( tempDir, self.__cfg['hod']['userid']\
                                  + "." + self.jobId )
                  mrSysDir = getMapredSystemDirectory(self.__cfg['hodring']['mapred-system-dir-root'],\
                                      self.__cfg['hod']['userid'], self.jobId)
                  self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\
                            hdfsAddr, mrSysDir, mapredAddr, clientParams,\
                            serverParams, finalServerParams,\
                            clusterFactor)
                  self.__log.info("hadoop-site.xml at %s" % clusterDir)
                  # end of hadoop-site.xml generation
                else:
                  status = 8
              else:
                status = 7  
            else:
              status = 6
            if status != 0:
              self.__log.debug("Cleaning up cluster id %s, as cluster could not be allocated." % self.jobId)
              if ringClient is None:
                self.delete_job(self.jobId)
              else:
                self.__log.debug("Calling rm.stop()")
                ringClient.stopRM()
                self.__log.debug("Returning from rm.stop()")
          except HodInterruptException, h:
            self.__log.info(HOD_INTERRUPTED_MESG)
            if self.ringmasterXRS:
              if ringClient is None:
                ringClient =  hodXRClient(self.ringmasterXRS)
              self.__log.debug("Calling rm.stop()")
              ringClient.stopRM()
              self.__log.debug("Returning from rm.stop()")
              self.__log.info("Cluster Shutdown by informing ringmaster.")
            else:
              self.delete_job(self.jobId)
              self.__log.info("Cluster %s removed from queue directly." % self.jobId)
            raise h
        else:
Exemplo n.º 19
0
    def __init__(self, cfg, log, **kwds):
        """Start the node pool and the ringmaster services.

        In order: builds the node pool, optionally copies the Hadoop tarball,
        creates the HDFS and Map/Reduce service objects, starts the ringmaster
        RPC service, optionally serves the tarball over HTTP, registers with
        the service registry when cfg['ringmaster']['register'] is set, fills
        in the 'hodring' config section and calls
        __init_job_tracker_monitor().

        cfg  -- configuration mapping; sections read here are 'ringmaster',
                'servicedesc', 'nodepooldesc', 'hodring', 'gridservice-hdfs'
                and 'gridservice-mapred'.
        log  -- logger used for the debug/critical messages below.
        kwds -- accepted but not used in this method.

        Raises Exception when the copied tarball cannot be found, or when the
        Hadoop version cannot be determined. Any failure while building the
        service objects is logged and re-raised.
        """
        self.download = False
        self.httpServer = None
        self.cfg = cfg
        self.log = log
        self.__hostname = local_fqdn()
        # NOTE(review): stays None in this method; the work dirs computed
        # further down are kept in a local variable only.
        self.workDirs = None

        # ref to the idle job tracker object.
        self.__jtMonitor = None
        self.__idlenessDetected = False
        self.__stopInProgress = False
        self.__isStopped = False  # to let main exit
        self.__exitCode = 0  # exit code with which the ringmaster main method should return

        self.workers_per_ring = self.cfg['ringmaster']['workers_per_ring']

        self.__initialize_signal_handlers()

        # NOTE(review): gsvc ends up holding the first entry of 'servicedesc'
        # (or None when it is empty) but is not read again in this method.
        sdd = self.cfg['servicedesc']
        gsvc = None
        for key in sdd:
            gsvc = sdd[key]
            break

        npd = self.cfg['nodepooldesc']
        self.np = NodePoolUtil.getNodePool(npd, cfg, log)

        self.log.debug("Getting service ID.")

        self.serviceId = self.np.getServiceId()

        self.log.debug("Got service ID: %s" % self.serviceId)

        # A configured 'hadoop-tar-ball' switches on download mode: the
        # tarball is copied locally and later served over HTTP.
        self.tarSrcLoc = None
        if self.cfg['ringmaster'].has_key('hadoop-tar-ball'):
            self.download = True
            self.tarSrcLoc = self.cfg['ringmaster']['hadoop-tar-ball']

        # The tarball copy/lookup below works on os.getcwd(); presumably
        # cd_to_tempdir() has just changed into the temp dir -- TODO confirm.
        self.cd_to_tempdir()

        if (self.download):
            self.__copy_tarball(os.getcwd())
            self.basename = self.__find_tarball_in_dir(os.getcwd())
            if self.basename is None:
                raise Exception('Did not find tarball copied from %s in %s.' %
                                (self.tarSrcLoc, os.getcwd()))

        self.serviceAddr = to_http_url(self.cfg['ringmaster']['svcrgy-addr'])

        self.log.debug("Service registry @ %s" % self.serviceAddr)

        self.serviceClient = hodXRClient(self.serviceAddr)
        # Maps service name -> service object; filled with the hdfs and
        # mapred services below and handed to ringMasterServer.startService().
        self.serviceDict = {}
        try:
            sdl = self.cfg['servicedesc']

            workDirs = self.getWorkDirs(cfg)

            hdfsDesc = sdl['hdfs']
            hdfs = None

            # Determine hadoop Version
            hadoopVers = hadoopVersion(self.__getHadoopDir(), \
                                      self.cfg['hodring']['java-home'], self.log)

            if (hadoopVers['major'] == None) or (hadoopVers['minor'] == None):
                raise Exception(
                    'Could not retrive the version of Hadoop.' +
                    ' Check the Hadoop installation or the value of the hodring.java-home variable.'
                )
            # Only the minor version is passed on to the service objects.
            # External services additionally get their master params from the
            # matching 'gridservice-*' config section.
            # NOTE(review): the meaning of the positional 0 (Hdfs) and 1
            # (MapReduce) arguments is not visible from here.
            if hdfsDesc.isExternal():
                hdfs = HdfsExternal(hdfsDesc,
                                    workDirs,
                                    version=int(hadoopVers['minor']))
                hdfs.setMasterParams(self.cfg['gridservice-hdfs'])
            else:
                hdfs = Hdfs(hdfsDesc,
                            workDirs,
                            0,
                            version=int(hadoopVers['minor']),
                            workers_per_ring=self.workers_per_ring)

            self.serviceDict[hdfs.getName()] = hdfs

            mrDesc = sdl['mapred']
            mr = None
            if mrDesc.isExternal():
                mr = MapReduceExternal(mrDesc,
                                       workDirs,
                                       version=int(hadoopVers['minor']))
                mr.setMasterParams(self.cfg['gridservice-mapred'])
            else:
                mr = MapReduce(mrDesc,
                               workDirs,
                               1,
                               version=int(hadoopVers['minor']),
                               workers_per_ring=self.workers_per_ring)

            self.serviceDict[mr.getName()] = mr
        except:
            # Log and re-raise: construction fails if either service object
            # cannot be built.
            # NOTE(review): the backslash continuation sits inside the string
            # literal, so the next line's leading spaces end up in the logged
            # message.
            self.log.critical(
                "Exception in creating Hdfs and Map/Reduce descriptor objects: \
                            %s." % get_exception_error_string())
            self.log.debug(get_exception_string())
            raise

        # should not be starting these in a constructor
        ringMasterServer.startService(self.serviceDict, cfg, self.np, log,
                                      self)

        self.rpcserver = ringMasterServer.getAddress()

        self.httpAddress = None
        self.tarAddress = None
        hostname = socket.gethostname()
        if (self.download):
            self.httpServer = threadedHTTPServer(
                hostname, self.cfg['ringmaster']['http-port-range'])

            # NOTE(review): the code after this call only runs if
            # serve_forever() returns; presumably the threaded server serves
            # in the background -- confirm.
            self.httpServer.serve_forever()
            self.httpAddress = "http://%s:%d/" % (
                self.httpServer.server_address[0],
                self.httpServer.server_address[1])
            self.tarAddress = "%s%s" % (self.httpAddress, self.basename)

            ringMasterServer.instance.logMasterSources.registerTarSource(
                hostname, self.tarAddress)
        else:
            self.log.debug("Download not set.")

        self.log.debug("%s %s %s %s %s" %
                       (self.cfg['ringmaster']['userid'], self.serviceId,
                        self.__hostname, 'ringmaster', 'hod'))

        # Register the 'xrs' address, plus the 'http' address when the
        # tarball HTTP server was started.
        if self.cfg['ringmaster']['register']:
            if self.httpAddress:
                self.serviceClient.registerService(
                    self.cfg['ringmaster']['userid'], self.serviceId,
                    self.__hostname, 'ringmaster', 'hod', {
                        'xrs': self.rpcserver,
                        'http': self.httpAddress
                    })
            else:
                self.serviceClient.registerService(
                    self.cfg['ringmaster']['userid'], self.serviceId,
                    self.__hostname, 'ringmaster', 'hod', {
                        'xrs': self.rpcserver,
                    })

        # NOTE(review): this message is logged even when the 'register' flag
        # is false and no registration happened.
        self.log.debug("Registered with serivce registry: %s." %
                       self.serviceAddr)

        # NOTE(review): hodRingPath is computed but not used in this method.
        hodRingPath = os.path.join(cfg['ringmaster']['base-dir'], 'bin',
                                   'hodring')
        hodRingWorkDir = os.path.join(cfg['hodring']['temp-dir'],
                                      'hodring' + '_' + getpass.getuser())

        # Fill in the 'hodring' config section with the values gathered above.
        self.cfg['hodring']['hodring'] = [
            hodRingWorkDir,
        ]
        self.cfg['hodring']['svcrgy-addr'] = self.cfg['ringmaster'][
            'svcrgy-addr']
        self.cfg['hodring']['service-id'] = self.np.getServiceId()

        self.cfg['hodring']['ringmaster-xrs-addr'] = self.__url_to_addr(
            self.rpcserver)

        # Set only in download mode (tarSrcLoc is assigned together with
        # self.download above).
        if (self.tarSrcLoc != None):
            cfg['hodring']['download-addr'] = self.tarAddress

        self.__init_job_tracker_monitor(
            ringMasterServer.instance.logMasterSources)
Exemplo n.º 20
0
 def testSuccess(self):
     """Calling testing() through hodXRClient should return True."""
     serverUrl = 'http://localhost:%s' % (serverPort,)
     rpcClient = hodXRClient(serverUrl, retryRequests=False)
     outcome = rpcClient.testing()
     self.assertEqual(outcome, True)
Exemplo n.º 21
0
  def clusterStart(self, initialize=True):
    """Start a stopped mapreduce/dfs cluster.

    Looks up the ringmaster's XML-RPC address in the service registry,
    fetches this node's command list from the ringmaster and runs the
    hadoop commands.

    initialize -- when true: download the package if 'download-addr' is
                  configured, fetch commands with getCommand(), run them and
                  send the resulting master params back to the ringmaster.
                  When false: fetch commands with getAdminCommand() and only
                  run them.

    Always returns True. Any exception raised along the way is logged with
    self.log.error and swallowed (deliberate best-effort, kept as is).
    """
    if initialize:
      self.log.debug('clusterStart Method Invoked - Initialize')
    else:
      self.log.debug('clusterStart Method Invoked - No Initialize')
    try:
      self.log.debug("Creating service registry XML-RPC client.")
      serviceClient = hodXRClient(to_http_url(self._cfg['svcrgy-addr']),
                                  None, None, 0, 0, 0)

      self.log.info("Fetching ringmaster information from service registry.")
      # NOTE(review): the registry is polled up to 3000 times back-to-back
      # with no delay between attempts.
      ringXRAddress = None
      attempts = 0
      while ringXRAddress is None and attempts < 3000:
        ringList = serviceClient.getServiceInfo(self._cfg['userid'],
          self._cfg['service-id'], 'ringmaster', 'hod')
        # Only a non-empty list carries the ringmaster entry.
        if len(ringList) and isinstance(ringList, list):
          ringXRAddress = ringList[0]['xrs']
        attempts += 1

      if ringXRAddress is None:
        raise Exception("Could not get ringmaster XML-RPC server address.")

      self.log.debug("Creating ringmaster XML-RPC client.")
      ringClient = hodXRClient(ringXRAddress, None, None, 0, 0, 0)

      # Identifies this process to the ringmaster (named so it does not
      # shadow the builtin id()).
      clientId = self.hostname + "_" + str(os.getpid())

      # NOTE(review): both loops below spin without a delay until the
      # ringmaster hands out a non-empty command list.
      cmdlist = []
      if initialize:
        if 'download-addr' in self._cfg:
          self.__download_package(ringClient)
        else:
          self.log.debug("Did not find a download address.")
        while cmdlist == []:
          cmdlist = ringClient.getCommand(clientId)
      else:
        while cmdlist == []:
          cmdlist = ringClient.getAdminCommand(clientId)

      self.log.debug(pformat(cmdlist))
      self._cfg['commanddesc'] = [CommandDesc(cmds['dict'], self.log)
                                  for cmds in cmdlist]

      if initialize:
        self.log.info("Running hadoop commands again... - Initialize")
        self.__run_hadoop_commands()
        masterParams = []
        for _name, cmd in self.__running.iteritems():
          self.log.debug(cmd)
          masterParams.extend(cmd.filledInKeyVals)

        self.log.debug("printing getparams")
        self.log.debug(pformat(clientId))
        self.log.debug(pformat(masterParams))
        # when this is on a required host, the ringMaster already has our masterParams
        if masterParams:
          ringClient.addMasterParams(clientId, masterParams)
      else:
        self.log.info("Running hadoop commands again... - No Initialize")
        self.__run_hadoop_commands()

    except:
      # Best-effort: failures are only logged, the caller still gets True.
      self.log.error(get_exception_string())

    return True
Exemplo n.º 22
0
 def testSuccess(self):
   """The testing() rpc called via hodXRClient is expected to return True."""
   address = ''.join(['http://localhost:', str(serverPort)])
   client = hodXRClient(address, retryRequests=False)
   self.assertEqual(client.testing(), True)