def run(self): """ Run this worker. Parameters: ---------------------------------------------------------------------- retval: jobID of the job we ran. This is used by unit test code when calling this working using the --params command line option (which tells this worker to insert the job itself). """ # Easier access to options options = self._options # --------------------------------------------------------------------- # Connect to the jobs database self.logger.info("Connecting to the jobs database") cjDAO = ClientJobsDAO.get() # Get our worker ID self._workerID = cjDAO.getConnectionID() if options.clearModels: cjDAO.modelsClearAll() # ------------------------------------------------------------------------- # if params were specified on the command line, insert a new job using # them. if options.params is not None: options.jobID = cjDAO.jobInsert( client="hwTest", cmdLine="echo 'test mode'", params=options.params, alreadyRunning=True, minimumWorkers=1, maximumWorkers=1, jobType=cjDAO.JOB_TYPE_HS, ) if options.workerID is not None: wID = options.workerID else: wID = self._workerID buildID = Configuration.get("nupic.software.buildNumber", "N/A") logPrefix = "<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> " % (buildID, wID, options.jobID) ExtendedLogger.setLogPrefix(logPrefix) # --------------------------------------------------------------------- # Get the search parameters # If asked to reset the job status, do that now if options.resetJobStatus: cjDAO.jobSetFields( options.jobID, fields={ "workerCompletionReason": ClientJobsDAO.CMPL_REASON_SUCCESS, "cancel": False, #'engWorkerState': None }, useConnectionID=False, ignoreUnchanged=True, ) jobInfo = cjDAO.jobInfo(options.jobID) self.logger.info("Job info retrieved: %s" % (str(clippedObj(jobInfo)))) # --------------------------------------------------------------------- # Instantiate the Hypersearch object, which will handle the logic of # which models to create when we need more to evaluate. jobParams = json.loads(jobInfo.params) # Validate job params jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema", "jobParamsSchema.json") validate(jobParams, schemaPath=jsonSchemaPath) hsVersion = jobParams.get("hsVersion", None) if hsVersion == "v2": self._hs = HypersearchV2( searchParams=jobParams, workerID=self._workerID, cjDAO=cjDAO, jobID=options.jobID, logLevel=options.logLevel, ) else: raise RuntimeError("Invalid Hypersearch implementation (%s) specified" % (hsVersion)) # ===================================================================== # The main loop. try: exit = False numModelsTotal = 0 print >>sys.stderr, "reporter:status:Evaluating first model..." while not exit: # ------------------------------------------------------------------ # Choose a model to evaluate batchSize = 10 # How many to try at a time. modelIDToRun = None while modelIDToRun is None: if options.modelID is None: # ----------------------------------------------------------------- # Get the latest results on all running models and send them to # the Hypersearch implementation # This calls cjDAO.modelsGetUpdateCounters(), compares the # updateCounters with what we have cached, fetches the results for the # changed and new models, and sends those to the Hypersearch # implementation's self._hs.recordModelProgress() method. self._processUpdatedModels(cjDAO) # -------------------------------------------------------------------- # Create a new batch of models (exit, newModels) = self._hs.createModels(numModels=batchSize) if exit: break # No more models left to create, just loop. The _hs is waiting for # all remaining running models to complete, and may pick up on an # orphan if it detects one. if len(newModels) == 0: continue # Try and insert one that we will run for (modelParams, modelParamsHash, particleHash) in newModels: jsonModelParams = json.dumps(modelParams) (modelID, ours) = cjDAO.modelInsertAndStart( options.jobID, jsonModelParams, modelParamsHash, particleHash ) # Some other worker is already running it, tell the Hypersearch object # so that it doesn't try and insert it again if not ours: mParamsAndHash = cjDAO.modelsGetParams([modelID])[0] mResult = cjDAO.modelsGetResultAndStatus([modelID])[0] results = mResult.results if results is not None: results = json.loads(results) modelParams = json.loads(mParamsAndHash.params) particleHash = cjDAO.modelsGetFields(modelID, ["engParticleHash"])[0] particleInst = "%s.%s" % ( modelParams["particleState"]["id"], modelParams["particleState"]["genIdx"], ) self.logger.info( "Adding model %d to our internal DB " "because modelInsertAndStart() failed to insert it: " "paramsHash=%s, particleHash=%s, particleId='%s'", modelID, mParamsAndHash.engParamsHash.encode("hex"), particleHash.encode("hex"), particleInst, ) self._hs.recordModelProgress( modelID=modelID, modelParams=modelParams, modelParamsHash=mParamsAndHash.engParamsHash, results=results, completed=(mResult.status == cjDAO.STATUS_COMPLETED), completionReason=mResult.completionReason, matured=mResult.engMatured, numRecords=mResult.numRecords, ) else: modelIDToRun = modelID break else: # A specific modelID was passed on the command line modelIDToRun = int(options.modelID) mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0] modelParams = json.loads(mParamsAndHash.params) modelParamsHash = mParamsAndHash.engParamsHash # Make us the worker cjDAO.modelSetFields(modelIDToRun, dict(engWorkerConnId=self._workerID)) if False: # Change the hash and params of the old entry so that we can # create a new model with the same params for attempt in range(1000): paramsHash = hashlib.md5("OrphanParams.%d.%d" % (modelIDToRun, attempt)).digest() particleHash = hashlib.md5("OrphanParticle.%d.%d" % (modelIDToRun, attempt)).digest() try: cjDAO.modelSetFields( modelIDToRun, dict(engParamsHash=paramsHash, engParticleHash=particleHash) ) success = True except: success = False if success: break if not success: raise RuntimeError( "Unexpected failure to change paramsHash and " "particleHash of orphaned model" ) (modelIDToRun, ours) = cjDAO.modelInsertAndStart( options.jobID, mParamsAndHash.params, modelParamsHash ) # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # --------------------------------------------------------------- # We have a model, evaluate it now # All done? if exit: break # Run the model now self.logger.info( "RUNNING MODEL GID=%d, paramsHash=%s, params=%s", modelIDToRun, modelParamsHash.encode("hex"), modelParams, ) # --------------------------------------------------------------------- # Construct model checkpoint GUID for this model: # jobParams['persistentJobGUID'] contains the client's (e.g., API Server) # persistent, globally-unique model identifier, which is what we need; persistentJobGUID = jobParams["persistentJobGUID"] assert persistentJobGUID, "persistentJobGUID: %r" % (persistentJobGUID,) modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + ("_" + str(modelIDToRun)) self._hs.runModel( modelID=modelIDToRun, jobID=options.jobID, modelParams=modelParams, modelParamsHash=modelParamsHash, jobsDAO=cjDAO, modelCheckpointGUID=modelCheckpointGUID, ) # TODO: don't increment for orphaned models numModelsTotal += 1 self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs", modelIDToRun, numModelsTotal) print >>sys.stderr, "reporter:status:Evaluated %d models..." % (numModelsTotal) print >>sys.stderr, "reporter:counter:HypersearchWorker,numModels,1" if options.modelID is not None: exit = True # ^^^ end while not exit finally: # Provide Hypersearch instance an opportunity to clean up temporary files self._hs.close() self.logger.info("FINISHED. Evaluated %d models." % (numModelsTotal)) print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (numModelsTotal) return options.jobID
jobID = None completionReason = ClientJobsDAO.CMPL_REASON_SUCCESS completionMsg = "Success" try: jobID = hst.run() except Exception, e: jobID = hst._options.jobID completionReason = ClientJobsDAO.CMPL_REASON_ERROR completionMsg = "ERROR: %s" % (e,) raise finally: if jobID is not None: cjDAO = ClientJobsDAO.get() cjDAO.jobSetCompleted(jobID=jobID, completionReason=completionReason, completionMsg=completionMsg) return jobID if __name__ == "__main__": logging.setLoggerClass(ExtendedLogger) buildID = Configuration.get("nupic.software.buildNumber", "N/A") logPrefix = "<BUILDID=%s, WORKER=HS, WRKID=N/A, JOBID=N/A> " % buildID ExtendedLogger.setLogPrefix(logPrefix) try: main(sys.argv) except: logging.exception("HypersearchWorker is exiting with unhandled exception; " "argv=%r", sys.argv) raise
# Instantiate the DummyWorker and run it dum = DummyWorker(options, argv[1:]) return dum.run() ############################################################################# if __name__ == "__main__": # Init the NuPic logging configuration from the nupic-logging.conf configuration # file. This is found either in the NTA_CONF_DIR directory (if defined) or # in the 'conf' subdirectory of the NuPic install location. initLogging(verbose=True) # Replace default logger with our extention logging.setLoggerClass(ExtendedLogger) logger = logging.getLogger('com.numenta.nupic.cluster.dummyworker.main') buildID = Configuration.get('nupic.software.buildNumber', 'N/A') logPrefix = '<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=N/A> ' % \ (buildID, wID) ExtendedLogger.setLogPrefix(logPrefix) try: main(sys.argv) except: msg = StringIO.StringIO() print >>msg, "Exception occurred running DummyWorker: " traceback.print_exc(None, msg) logger.error(msg.getvalue()) msg.close() del msg raise
def run(self): """ Run this worker. Parameters: ---------------------------------------------------------------------- retval: jobID of the job we ran. This is used by unit test code when calling this working using the --params command line option (which tells this worker to insert the job itself). """ # Easier access to options options = self._options # --------------------------------------------------------------------- # Connect to the jobs database self.logger.info("Connecting to the jobs database") cjDAO = ClientJobsDAO.get() # Get our worker ID self._workerID = cjDAO.getConnectionID() # ------------------------------------------------------------------------- # if params were specified on the command line, insert a new job using # them. if options.params is not None: options.jobID = cjDAO.jobInsert(client='dummy', cmdLine="python -m nupic.swarming.DummyWorker --jobID={JOBID}", params=options.params) # --------------------------------------------------------------------- # Get the search parameters jobInfo = cjDAO.jobInfo(options.jobID) self.logger.info("Job info retrieved: %s" % (str(jobInfo))) if options.workerID is not None: wID = options.workerID else: wID = self._workerID buildID = Configuration.get('nupic.software.buildNumber', 'N/A') logPrefix = '<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=%s> ' % \ (buildID, wID, options.jobID) ExtendedLogger.setLogPrefix(logPrefix) # --------------------------------------------------------------------- # Instantiate the Dummy object, which will handle the logic of # which models to create when we need more to evaluate. jobParams = json.loads(jobInfo.params) self.logger.info("Job Params: %s" % jobInfo.params) # prints the current status print >>sys.stderr, "reporter:status:Running dummy worker on job:%d" % \ (options.jobID) self.logger.info("Start of the dummy worker") startTime = time.time() runTime = jobParams['runTime'] jobLoad = jobParams['load'] crashJob = jobParams['crash'] try: while True: if runTime != -1 and time.time() > startTime + runTime: break self.logger.info("In dummy worker") if jobLoad == 'heavy': # Computationally intensive process # Takes 0.8 sec approximately numIterations = 30000 for i in range(numIterations): d = numpy.random.rand(1000).sum() else: time.sleep(0.8) except: self.logger.exception("DummyWorker exception;") if crashJob: self.logger.info("Crash of the dummy worker") print >>sys.stderr, "reporter:status:Crashed dummy worker..." raise RuntimeError("Simulating job crash.") else: self.logger.info("End of the dummy worker") print >>sys.stderr, "reporter:status:Finished dummy worker..." #import auxilary #auxilary.do_something() return options.jobID
def run(self): """ Run this worker. Parameters: ---------------------------------------------------------------------- retval: jobID of the job we ran. This is used by unit test code when calling this working using the --params command line option (which tells this worker to insert the job itself). """ # Easier access to options options = self._options # --------------------------------------------------------------------- # Connect to the jobs database self.logger.info("Connecting to the jobs database") cjDAO = ClientJobsDAO.get() # Get our worker ID self._workerID = cjDAO.getConnectionID() if options.clearModels: cjDAO.modelsClearAll() # ------------------------------------------------------------------------- # if params were specified on the command line, insert a new job using # them. if options.params is not None: options.jobID = cjDAO.jobInsert(client='hwTest', cmdLine="echo 'test mode'", params=options.params, alreadyRunning=True, minimumWorkers=1, maximumWorkers=1, jobType=cjDAO.JOB_TYPE_HS) if options.workerID is not None: wID = options.workerID else: wID = self._workerID buildID = Configuration.get('nupic.software.buildNumber', 'N/A') logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \ (buildID, wID, options.jobID) ExtendedLogger.setLogPrefix(logPrefix) # --------------------------------------------------------------------- # Get the search parameters # If asked to reset the job status, do that now if options.resetJobStatus: cjDAO.jobSetFields( options.jobID, fields={ 'workerCompletionReason': ClientJobsDAO.CMPL_REASON_SUCCESS, 'cancel': False, #'engWorkerState': None }, useConnectionID=False, ignoreUnchanged=True) jobInfo = cjDAO.jobInfo(options.jobID) self.logger.info("Job info retrieved: %s" % (str(clippedObj(jobInfo)))) # --------------------------------------------------------------------- # Instantiate the swarm object, which will handle the logic of # which models to create when we need more to evaluate. jobParams = json.loads(jobInfo.params) # Validate job params jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema", "jobParamsSchema.json") jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath) hsVersion = jobParams.get('hsVersion', None) if hsVersion == 'v2': self._hs = SwarmV2(searchParams=jobParams, workerID=self._workerID, cjDAO=cjDAO, jobID=options.jobID, logLevel=options.logLevel) else: raise RuntimeError("Invalid swarm implementation (%s) specified" \ % (hsVersion)) # ===================================================================== # The main loop. try: exit = False numModelsTotal = 0 print >> sys.stderr, "reporter:status:Evaluating first model..." while not exit: # ------------------------------------------------------------------ # Choose a model to evaluate batchSize = 10 # How many to try at a time. modelIDToRun = None while modelIDToRun is None: if options.modelID is None: # ----------------------------------------------------------------- # Get the latest results on all running models and send them to # the swarm implementation # This calls cjDAO.modelsGetUpdateCounters(), compares the # updateCounters with what we have cached, fetches the results for the # changed and new models, and sends those to the swarm # implementation's self._hs.recordModelProgress() method. self._processUpdatedModels(cjDAO) # -------------------------------------------------------------------- # Create a new batch of models (exit, newModels) = self._hs.createModels( numModels=batchSize) if exit: break # No more models left to create, just loop. The _hs is waiting for # all remaining running models to complete, and may pick up on an # orphan if it detects one. if len(newModels) == 0: continue # Try and insert one that we will run for (modelParams, modelParamsHash, particleHash) in newModels: jsonModelParams = json.dumps(modelParams) (modelID, ours) = cjDAO.modelInsertAndStart( options.jobID, jsonModelParams, modelParamsHash, particleHash) # Some other worker is already running it, tell the swarm object # so that it doesn't try and insert it again if not ours: mParamsAndHash = cjDAO.modelsGetParams( [modelID])[0] mResult = cjDAO.modelsGetResultAndStatus( [modelID])[0] results = mResult.results if results is not None: results = json.loads(results) modelParams = json.loads(mParamsAndHash.params) particleHash = cjDAO.modelsGetFields( modelID, ['engParticleHash'])[0] particleInst = "%s.%s" % ( modelParams['particleState']['id'], modelParams['particleState']['genIdx']) self.logger.info("Adding model %d to our internal DB " \ "because modelInsertAndStart() failed to insert it: " \ "paramsHash=%s, particleHash=%s, particleId='%s'", modelID, mParamsAndHash.engParamsHash.encode('hex'), particleHash.encode('hex'), particleInst) self._hs.recordModelProgress( modelID=modelID, modelParams=modelParams, modelParamsHash=mParamsAndHash. engParamsHash, results=results, completed=(mResult.status == cjDAO.STATUS_COMPLETED), completionReason=mResult.completionReason, matured=mResult.engMatured, numRecords=mResult.numRecords) else: modelIDToRun = modelID break else: # A specific modelID was passed on the command line modelIDToRun = int(options.modelID) mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun ])[0] modelParams = json.loads(mParamsAndHash.params) modelParamsHash = mParamsAndHash.engParamsHash # Make us the worker cjDAO.modelSetFields( modelIDToRun, dict(engWorkerConnId=self._workerID)) if False: # Change the hash and params of the old entry so that we can # create a new model with the same params for attempt in range(1000): paramsHash = hashlib.md5( "OrphanParams.%d.%d" % (modelIDToRun, attempt)).digest() particleHash = hashlib.md5( "OrphanParticle.%d.%d" % (modelIDToRun, attempt)).digest() try: cjDAO.modelSetFields( modelIDToRun, dict(engParamsHash=paramsHash, engParticleHash=particleHash)) success = True except: success = False if success: break if not success: raise RuntimeError( "Unexpected failure to change paramsHash and " "particleHash of orphaned model") (modelIDToRun, ours) = cjDAO.modelInsertAndStart( options.jobID, mParamsAndHash.params, modelParamsHash) # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # --------------------------------------------------------------- # We have a model, evaluate it now # All done? if exit: break # Run the model now self.logger.info( "RUNNING MODEL GID=%d, paramsHash=%s, params=%s", modelIDToRun, modelParamsHash.encode('hex'), modelParams) # --------------------------------------------------------------------- # Construct model checkpoint GUID for this model: # jobParams['persistentJobGUID'] contains the client's (e.g., API Server) # persistent, globally-unique model identifier, which is what we need; persistentJobGUID = jobParams['persistentJobGUID'] assert persistentJobGUID, "persistentJobGUID: %r" % ( persistentJobGUID, ) modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + ( '_' + str(modelIDToRun)) self._hs.runModel(modelID=modelIDToRun, jobID=options.jobID, modelParams=modelParams, modelParamsHash=modelParamsHash, jobsDAO=cjDAO, modelCheckpointGUID=modelCheckpointGUID) # TODO: don't increment for orphaned models numModelsTotal += 1 self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs", modelIDToRun, numModelsTotal) print >>sys.stderr, "reporter:status:Evaluated %d models..." % \ (numModelsTotal) print >> sys.stderr, "reporter:counter:swarmWorker,numModels,1" if options.modelID is not None: exit = True # ^^^ end while not exit finally: # Provide swarm instance an opportunity to clean up temporary files self._hs.close() self.logger.info("FINISHED. Evaluated %d models." % (numModelsTotal)) print >> sys.stderr, "reporter:status:Finished, evaluated %d models" % ( numModelsTotal) return options.jobID