Пример #1
0
    def run(self):
        """Run this worker: pick models for the job and evaluate them.

        The method connects to the jobs database, optionally inserts a new job
        (when ``options.params`` is set), loads and validates the job params,
        builds the Hypersearch object and then loops: it asks the Hypersearch
        object for new models, tries to claim one in the database and runs it.
        When ``options.modelID`` is set, exactly that model is run once and
        the loop ends.  ``self._hs.close()`` is always called once the
        Hypersearch object exists, even if the loop raises.

        Parameters:
        ----------------------------------------------------------------------
        retval:     jobID of the job we ran. This is used by unit test code
                      when calling this worker using the --params command
                      line option (which tells this worker to insert the job
                      itself).

        Raises RuntimeError when the job params request a Hypersearch version
        other than 'v2'.
        """
        # Easier access to options
        options = self._options

        # ---------------------------------------------------------------------
        # Connect to the jobs database
        self.logger.info("Connecting to the jobs database")
        cjDAO = ClientJobsDAO.get()

        # Get our worker ID
        self._workerID = cjDAO.getConnectionID()

        if options.clearModels:
            cjDAO.modelsClearAll()

        # ---------------------------------------------------------------------
        # If params were specified on the command line, insert a new job using
        # them.
        if options.params is not None:
            options.jobID = cjDAO.jobInsert(
                client="hwTest",
                cmdLine="echo 'test mode'",
                params=options.params,
                alreadyRunning=True,
                minimumWorkers=1,
                maximumWorkers=1,
                jobType=cjDAO.JOB_TYPE_HS,
            )

        # The worker ID shown in the log prefix may be overridden from the
        # command line; self._workerID itself stays the DB connection ID.
        wID = options.workerID if options.workerID is not None else self._workerID
        buildID = Configuration.get("nupic.software.buildNumber", "N/A")
        logPrefix = "<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> " % (buildID, wID, options.jobID)
        ExtendedLogger.setLogPrefix(logPrefix)

        # ---------------------------------------------------------------------
        # If asked to reset the job status, do that now
        if options.resetJobStatus:
            cjDAO.jobSetFields(
                options.jobID,
                fields={
                    "workerCompletionReason": ClientJobsDAO.CMPL_REASON_SUCCESS,
                    "cancel": False,
                },
                useConnectionID=False,
                ignoreUnchanged=True,
            )

        # Get the search parameters
        jobInfo = cjDAO.jobInfo(options.jobID)
        self.logger.info("Job info retrieved: %s", str(clippedObj(jobInfo)))

        # ---------------------------------------------------------------------
        # Instantiate the Hypersearch object, which will handle the logic of
        #  which models to create when we need more to evaluate.
        jobParams = json.loads(jobInfo.params)

        # Validate job params
        jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema", "jobParamsSchema.json")
        validate(jobParams, schemaPath=jsonSchemaPath)

        hsVersion = jobParams.get("hsVersion", None)
        if hsVersion != "v2":
            raise RuntimeError("Invalid Hypersearch implementation (%s) specified" % (hsVersion,))
        self._hs = HypersearchV2(
            searchParams=jobParams,
            workerID=self._workerID,
            cjDAO=cjDAO,
            jobID=options.jobID,
            logLevel=options.logLevel,
        )

        # =====================================================================
        # The main loop.
        try:
            searchDone = False
            numModelsTotal = 0
            batchSize = 10  # How many new models to ask for at a time.
            print >>sys.stderr, "reporter:status:Evaluating first model..."
            while not searchDone:

                # ------------------------------------------------------------
                # Choose a model to evaluate
                modelIDToRun = None
                while modelIDToRun is None:

                    if options.modelID is not None:
                        # A specific modelID was passed on the command line:
                        # load its params and make us the worker for it.
                        modelIDToRun = int(options.modelID)
                        mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0]
                        modelParams = json.loads(mParamsAndHash.params)
                        modelParamsHash = mParamsAndHash.engParamsHash
                        cjDAO.modelSetFields(modelIDToRun, dict(engWorkerConnId=self._workerID))
                        break

                    # Get the latest results on all running models and send
                    # them to the Hypersearch implementation (see
                    # self._processUpdatedModels).
                    self._processUpdatedModels(cjDAO)

                    # Create a new batch of models
                    (searchDone, newModels) = self._hs.createModels(numModels=batchSize)
                    if searchDone:
                        break

                    # No more models left to create, just loop. The _hs is
                    # waiting for all remaining running models to complete,
                    # and may pick up on an orphan if it detects one.
                    if not newModels:
                        continue

                    # Try and insert one that we will run
                    for (modelParams, modelParamsHash, particleHash) in newModels:
                        jsonModelParams = json.dumps(modelParams)
                        (modelID, ours) = cjDAO.modelInsertAndStart(
                            options.jobID, jsonModelParams, modelParamsHash, particleHash
                        )
                        if ours:
                            modelIDToRun = modelID
                            break

                        # Some other worker is already running it, tell the
                        # Hypersearch object so that it doesn't try and insert
                        # it again. Separate names are used so the candidate
                        # taken from newModels is not overwritten.
                        mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                        mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
                        results = mResult.results
                        if results is not None:
                            results = json.loads(results)

                        otherParams = json.loads(mParamsAndHash.params)
                        otherParticleHash = cjDAO.modelsGetFields(modelID, ["engParticleHash"])[0]
                        particleInst = "%s.%s" % (
                            otherParams["particleState"]["id"],
                            otherParams["particleState"]["genIdx"],
                        )
                        self.logger.info(
                            "Adding model %d to our internal DB "
                            "because modelInsertAndStart() failed to insert it: "
                            "paramsHash=%s, particleHash=%s, particleId='%s'",
                            modelID,
                            mParamsAndHash.engParamsHash.encode("hex"),
                            otherParticleHash.encode("hex"),
                            particleInst,
                        )
                        self._hs.recordModelProgress(
                            modelID=modelID,
                            modelParams=otherParams,
                            modelParamsHash=mParamsAndHash.engParamsHash,
                            results=results,
                            completed=(mResult.status == cjDAO.STATUS_COMPLETED),
                            completionReason=mResult.completionReason,
                            matured=mResult.engMatured,
                            numRecords=mResult.numRecords,
                        )
                    # ^^^ end while modelIDToRun is None

                # ------------------------------------------------------------
                # All done?
                if searchDone:
                    break

                # We have a model, evaluate it now
                self.logger.info(
                    "RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
                    modelIDToRun,
                    modelParamsHash.encode("hex"),
                    modelParams,
                )

                # ------------------------------------------------------------
                # Construct model checkpoint GUID for this model:
                # jobParams['persistentJobGUID'] contains the client's (e.g.,
                # API Server) persistent, globally-unique model identifier,
                # which is what we need.
                persistentJobGUID = jobParams["persistentJobGUID"]
                assert persistentJobGUID, "persistentJobGUID: %r" % (persistentJobGUID,)

                modelCheckpointGUID = "_".join([jobInfo.client, persistentJobGUID, str(modelIDToRun)])

                self._hs.runModel(
                    modelID=modelIDToRun,
                    jobID=options.jobID,
                    modelParams=modelParams,
                    modelParamsHash=modelParamsHash,
                    jobsDAO=cjDAO,
                    modelCheckpointGUID=modelCheckpointGUID,
                )

                # TODO: don't increment for orphaned models
                numModelsTotal += 1

                self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs", modelIDToRun, numModelsTotal)
                print >>sys.stderr, "reporter:status:Evaluated %d models..." % (numModelsTotal)
                print >>sys.stderr, "reporter:counter:HypersearchWorker,numModels,1"

                # A model named on the command line is run exactly once
                if options.modelID is not None:
                    searchDone = True
                # ^^^ end while not searchDone

        finally:
            # Provide Hypersearch instance an opportunity to clean up temporary files
            self._hs.close()

        self.logger.info("FINISHED. Evaluated %d models.", numModelsTotal)
        print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (numModelsTotal)
        return options.jobID
Пример #2
0
        jobID = None
        completionReason = ClientJobsDAO.CMPL_REASON_SUCCESS
        completionMsg = "Success"

        try:
            jobID = hst.run()
        except Exception, e:
            jobID = hst._options.jobID
            completionReason = ClientJobsDAO.CMPL_REASON_ERROR
            completionMsg = "ERROR: %s" % (e,)
            raise
        finally:
            if jobID is not None:
                cjDAO = ClientJobsDAO.get()
                cjDAO.jobSetCompleted(jobID=jobID, completionReason=completionReason, completionMsg=completionMsg)

    return jobID


if __name__ == "__main__":
    # Every logger created from here on uses the prefix-aware logger class.
    logging.setLoggerClass(ExtendedLogger)

    # No worker or job is known at startup, so those prefix fields read N/A.
    buildID = Configuration.get("nupic.software.buildNumber", "N/A")
    logPrefix = "<BUILDID=%s, WORKER=HS, WRKID=N/A, JOBID=N/A> " % buildID
    ExtendedLogger.setLogPrefix(logPrefix)

    try:
        main(sys.argv)
    except:
        # Record the failure together with the command line that caused it,
        # then let the exception propagate to the interpreter.
        logging.exception(
            "HypersearchWorker is exiting with unhandled exception; argv=%r",
            sys.argv,
        )
        raise
Пример #3
0
  # Instantiate the DummyWorker and run it
  dum = DummyWorker(options, argv[1:])
  return dum.run()


#############################################################################
if __name__ == "__main__":
  # Init the NuPic logging configuration from the nupic-logging.conf configuration
  # file. This is found either in the NTA_CONF_DIR directory (if defined) or
  # in the 'conf' subdirectory of the NuPic install location.
  initLogging(verbose=True)
  # Replace default logger with our extension
  logging.setLoggerClass(ExtendedLogger)
  logger = logging.getLogger('com.numenta.nupic.cluster.dummyworker.main')

  # No worker has been created yet, so there is no worker ID to report here.
  # (The previous code referenced an undefined module-level name `wID`, which
  # raised NameError before main() could run.)
  buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
  logPrefix = '<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=N/A> ' % \
              (buildID, 'N/A')
  ExtendedLogger.setLogPrefix(logPrefix)

  try:
    main(sys.argv)
  except:
    # Send the full traceback through our logger before letting the
    # exception propagate to the interpreter.
    logger.error("Exception occurred running DummyWorker: \n" +
                 traceback.format_exc())
    raise
Пример #4
0
  def run(self):
    """ Execute the dummy job and report which job was run.

    The worker looks up its job (inserting one first when params were given
    on the command line), then keeps itself busy until the job's 'runTime'
    seconds have elapsed; a 'runTime' of -1 never expires. With a 'load' of
    'heavy' each pass does numpy work, otherwise it sleeps. Afterwards the
    job's 'crash' flag decides whether a crash is simulated.

    Parameters:
    ----------------------------------------------------------------------
    retval:     jobID of the job we ran. This is used by unit test code
                  when calling this worker using the --params command
                  line option (which tells this worker to insert the job
                  itself).

    Raises RuntimeError when the job params ask for a simulated crash.
    """
    opts = self._options

    # ---------------------------------------------------------------------
    # Open the jobs database; our connection ID doubles as the worker ID
    self.logger.info("Connecting to the jobs database")
    dao = ClientJobsDAO.get()
    self._workerID = dao.getConnectionID()

    # ---------------------------------------------------------------------
    # Params given on the command line mean we must create the job ourselves
    if opts.params is not None:
      opts.jobID = dao.jobInsert(
          client='dummy',
          cmdLine="python -m nupic.swarming.DummyWorker --jobID={JOBID}",
          params=opts.params)

    # ---------------------------------------------------------------------
    # Fetch the job record
    jobInfo = dao.jobInfo(opts.jobID)
    self.logger.info("Job info retrieved: %s" % (str(jobInfo)))

    # The log prefix shows the command-line worker ID when one was supplied
    workerTag = self._workerID if opts.workerID is None else opts.workerID
    buildNumber = Configuration.get('nupic.software.buildNumber', 'N/A')
    ExtendedLogger.setLogPrefix(
        '<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=%s> '
        % (buildNumber, workerTag, opts.jobID))

    # ---------------------------------------------------------------------
    # Decode the job params that steer the dummy workload
    jobParams = json.loads(jobInfo.params)
    self.logger.info("Job Params: %s" % jobInfo.params)

    # prints the current status
    print >>sys.stderr, "reporter:status:Running dummy worker on job:%d" % \
                                                    (opts.jobID)

    self.logger.info("Start of the dummy worker")
    startTime = time.time()
    runTime = jobParams['runTime']
    jobLoad = jobParams['load']
    crashJob = jobParams['crash']

    def timeIsUp():
      # A runTime of -1 means the worker never stops on its own
      return runTime != -1 and time.time() > startTime + runTime

    try:
      while not timeIsUp():
        self.logger.info("In dummy worker")
        if jobLoad != 'heavy':
          time.sleep(0.8)
          continue
        # Computationally intensive pass, takes 0.8 sec approximately
        for _ in range(30000):
          numpy.random.rand(1000).sum()
    except:
      # Any failure in the busy loop is logged; the worker then finishes up
      self.logger.exception("DummyWorker exception;")

    if not crashJob:
      self.logger.info("End of the dummy worker")
      print >>sys.stderr, "reporter:status:Finished dummy worker..."
      return opts.jobID

    self.logger.info("Crash of the dummy worker")
    print >>sys.stderr, "reporter:status:Crashed dummy worker..."
    raise RuntimeError("Simulating job crash.")
Пример #5
0
    def run(self):
        """Run this worker: pick models for the job and evaluate them.

        The method connects to the jobs database, optionally inserts a new job
        (when ``options.params`` is set), loads and validates the job params,
        builds the swarm object and then loops: it asks the swarm object for
        new models, tries to claim one in the database and runs it.  When
        ``options.modelID`` is set, exactly that model is run once and the
        loop ends.  ``self._hs.close()`` is always called once the swarm
        object exists, even if the loop raises.

        Parameters:
        ----------------------------------------------------------------------
        retval:     jobID of the job we ran. This is used by unit test code
                      when calling this worker using the --params command
                      line option (which tells this worker to insert the job
                      itself).

        Raises RuntimeError when the job params request a swarm version other
        than 'v2'.
        """
        # Easier access to options
        options = self._options

        # ---------------------------------------------------------------------
        # Connect to the jobs database
        self.logger.info("Connecting to the jobs database")
        cjDAO = ClientJobsDAO.get()

        # Get our worker ID
        self._workerID = cjDAO.getConnectionID()

        if options.clearModels:
            cjDAO.modelsClearAll()

        # ---------------------------------------------------------------------
        # If params were specified on the command line, insert a new job using
        # them.
        if options.params is not None:
            options.jobID = cjDAO.jobInsert(client='hwTest',
                                            cmdLine="echo 'test mode'",
                                            params=options.params,
                                            alreadyRunning=True,
                                            minimumWorkers=1,
                                            maximumWorkers=1,
                                            jobType=cjDAO.JOB_TYPE_HS)

        # The worker ID shown in the log prefix may be overridden from the
        # command line; self._workerID itself stays the DB connection ID.
        if options.workerID is not None:
            wID = options.workerID
        else:
            wID = self._workerID

        buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
        logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \
                    (buildID, wID, options.jobID)
        ExtendedLogger.setLogPrefix(logPrefix)

        # ---------------------------------------------------------------------
        # If asked to reset the job status, do that now
        if options.resetJobStatus:
            cjDAO.jobSetFields(
                options.jobID,
                fields={
                    'workerCompletionReason':
                    ClientJobsDAO.CMPL_REASON_SUCCESS,
                    'cancel': False,
                },
                useConnectionID=False,
                ignoreUnchanged=True)

        # Get the search parameters
        jobInfo = cjDAO.jobInfo(options.jobID)
        self.logger.info("Job info retrieved: %s", str(clippedObj(jobInfo)))

        # ---------------------------------------------------------------------
        # Instantiate the swarm object, which will handle the logic of
        #  which models to create when we need more to evaluate.
        jobParams = json.loads(jobInfo.params)

        # Validate job params
        jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema",
                                      "jobParamsSchema.json")
        jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath)

        hsVersion = jobParams.get('hsVersion', None)
        if hsVersion != 'v2':
            raise RuntimeError("Invalid swarm implementation (%s) specified"
                               % (hsVersion,))
        self._hs = SwarmV2(searchParams=jobParams,
                           workerID=self._workerID,
                           cjDAO=cjDAO,
                           jobID=options.jobID,
                           logLevel=options.logLevel)

        # =====================================================================
        # The main loop.
        try:
            searchDone = False
            numModelsTotal = 0
            batchSize = 10  # How many new models to ask for at a time.
            print >> sys.stderr, "reporter:status:Evaluating first model..."
            while not searchDone:

                # ------------------------------------------------------------
                # Choose a model to evaluate
                modelIDToRun = None
                while modelIDToRun is None:

                    if options.modelID is not None:
                        # A specific modelID was passed on the command line:
                        # load its params and make us the worker for it.
                        modelIDToRun = int(options.modelID)
                        mParamsAndHash = cjDAO.modelsGetParams(
                            [modelIDToRun])[0]
                        modelParams = json.loads(mParamsAndHash.params)
                        modelParamsHash = mParamsAndHash.engParamsHash
                        cjDAO.modelSetFields(
                            modelIDToRun, dict(engWorkerConnId=self._workerID))
                        break

                    # Get the latest results on all running models and send
                    # them to the swarm implementation (see
                    # self._processUpdatedModels).
                    self._processUpdatedModels(cjDAO)

                    # Create a new batch of models
                    (searchDone, newModels) = self._hs.createModels(
                        numModels=batchSize)
                    if searchDone:
                        break

                    # No more models left to create, just loop. The _hs is
                    # waiting for all remaining running models to complete,
                    # and may pick up on an orphan if it detects one.
                    if not newModels:
                        continue

                    # Try and insert one that we will run
                    for (modelParams, modelParamsHash,
                         particleHash) in newModels:
                        jsonModelParams = json.dumps(modelParams)
                        (modelID, ours) = cjDAO.modelInsertAndStart(
                            options.jobID, jsonModelParams,
                            modelParamsHash, particleHash)
                        if ours:
                            modelIDToRun = modelID
                            break

                        # Some other worker is already running it, tell the
                        # swarm object so that it doesn't try and insert it
                        # again. Separate names are used so the candidate
                        # taken from newModels is not overwritten.
                        mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                        mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
                        results = mResult.results
                        if results is not None:
                            results = json.loads(results)

                        otherParams = json.loads(mParamsAndHash.params)
                        otherParticleHash = cjDAO.modelsGetFields(
                            modelID, ['engParticleHash'])[0]
                        particleInst = "%s.%s" % (
                            otherParams['particleState']['id'],
                            otherParams['particleState']['genIdx'])
                        self.logger.info(
                            "Adding model %d to our internal DB "
                            "because modelInsertAndStart() failed to insert it: "
                            "paramsHash=%s, particleHash=%s, particleId='%s'",
                            modelID,
                            mParamsAndHash.engParamsHash.encode('hex'),
                            otherParticleHash.encode('hex'), particleInst)
                        self._hs.recordModelProgress(
                            modelID=modelID,
                            modelParams=otherParams,
                            modelParamsHash=mParamsAndHash.engParamsHash,
                            results=results,
                            completed=(mResult.status ==
                                       cjDAO.STATUS_COMPLETED),
                            completionReason=mResult.completionReason,
                            matured=mResult.engMatured,
                            numRecords=mResult.numRecords)
                    # ^^^ end while modelIDToRun is None

                # ------------------------------------------------------------
                # All done?
                if searchDone:
                    break

                # We have a model, evaluate it now
                self.logger.info(
                    "RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
                    modelIDToRun, modelParamsHash.encode('hex'), modelParams)

                # ------------------------------------------------------------
                # Construct model checkpoint GUID for this model:
                # jobParams['persistentJobGUID'] contains the client's (e.g.,
                # API Server) persistent, globally-unique model identifier,
                # which is what we need.
                persistentJobGUID = jobParams['persistentJobGUID']
                assert persistentJobGUID, "persistentJobGUID: %r" % (
                    persistentJobGUID, )

                modelCheckpointGUID = "_".join(
                    [jobInfo.client, persistentJobGUID, str(modelIDToRun)])

                self._hs.runModel(modelID=modelIDToRun,
                                  jobID=options.jobID,
                                  modelParams=modelParams,
                                  modelParamsHash=modelParamsHash,
                                  jobsDAO=cjDAO,
                                  modelCheckpointGUID=modelCheckpointGUID)

                # TODO: don't increment for orphaned models
                numModelsTotal += 1

                self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
                                 modelIDToRun, numModelsTotal)
                print >>sys.stderr, "reporter:status:Evaluated %d models..." % \
                                            (numModelsTotal)
                print >> sys.stderr, "reporter:counter:swarmWorker,numModels,1"

                # A model named on the command line is run exactly once
                if options.modelID is not None:
                    searchDone = True
                # ^^^ end while not searchDone

        finally:
            # Provide swarm instance an opportunity to clean up temporary files
            self._hs.close()

        self.logger.info("FINISHED. Evaluated %d models.", numModelsTotal)
        print >> sys.stderr, "reporter:status:Finished, evaluated %d models" % (
            numModelsTotal)
        return options.jobID