Example No. 1
  def call_next(self, delay=.75):
    """Reschedule this service as the next Slurm job (if currently running
    as one), to begin after a fraction of the remaining wall time.
    delay is a float between 0 and 1.
    """

    if self.slurm_id is None:
      logging.warning("Skipping self-invoked Call Next (not a Slurm Job)")
    else:
      ts = int(dt.now().timestamp())
      total_time = int(self.ttl) - ts
      next_start_delay = round(delay * total_time) 
      logging.debug('TIMES')
      logging.debug('TTL %f', self.ttl)
      logging.debug('TS  %d', ts)
      logging.info('Redis Service will schedule next job to begin in %d seconds', next_start_delay)
      # for k, v in os.environ.items():
      #   print(k, ':     ', v)
      # Carry forward the current allocation's Slurm parameters; exclude the
      # node(s) this job occupies so the replacement job is placed elsewhere.
      params = {}
      params['time']     = self.jobinfo['TimeLimit']
      params['exclude']  = self.jobinfo['NodeList']
      params['nodes']    = os.getenv('SLURM_JOB_NUM_NODES')
      params['cpus-per-task'] = os.getenv('SLURM_CPUS_PER_TASK')
      params['partition']= os.getenv('SLURM_JOB_PARTITION')
      params['job-name'] = os.getenv('SLURM_JOB_NAME')
      params['workdir']  = os.getcwd()
      params['begin'] = 'now+%d' % (next_start_delay)
      params['output'] = '/home-1/[email protected]/ddc/osvc-redis-%%j.out'
      logging.debug('CALL NEXT for next Redis Server Handover:  %s', str(params))
      slurm.sbatch(taskid=params['job-name'],
              options = params,
              modules = set(['redis']),
              cmd = "src/overlay.py --name=%s redis start" % self._name_app)
Example No. 2
    def call_next(self, delay=.75):
        """Reschedule this service as the next Slurm job (if currently
        running as one), to begin after a fraction of the remaining wall
        time. delay is a float between 0 and 1.
        """

        if self.slurm_id is None:
            logging.warning(
                "Skipping self-invoked Call Next (not a Slurm Job)")
        else:
            ts = int(dt.now().timestamp())
            total_time = int(self.ttl) - ts
            next_start_delay = round(delay * total_time)
            logging.debug('TIMES')
            logging.debug('TTL %f', self.ttl)
            logging.debug('TS  %d', ts)
            logging.info(
                'Redis Service will schedule next job to begin in %d seconds',
                next_start_delay)
            # for k, v in os.environ.items():
            #   print(k, ':     ', v)
            params = {}
            params['time'] = self.jobinfo['TimeLimit']
            params['exclude'] = self.jobinfo['NodeList']
            params['nodes'] = os.getenv('SLURM_JOB_NUM_NODES')
            params['cpus-per-task'] = os.getenv('SLURM_CPUS_PER_TASK')
            params['partition'] = os.getenv('SLURM_JOB_PARTITION')
            params['job-name'] = os.getenv('SLURM_JOB_NAME')
            params['workdir'] = os.getcwd()
            params['begin'] = 'now+%d' % (next_start_delay)
            params['output'] = '/home-1/[email protected]/ddc/osvc-redis-%%j.out'
            logging.debug('CALL NEXT for next Redis Server Handover:  %s',
                          str(params))
            slurm.sbatch(taskid=params['job-name'],
                         options=params,
                         modules=set(['redis']),
                         cmd="src/overlay.py --name=%s redis start" %
                         self._name_app)
Example No. 3
  def run(self):
    args = self.parser.parse_args()

    settings = systemsettings()
    self.experiment_number = settings.EXPERIMENT_NUMBER

    logging.info("APPLICATION:    %s", settings.APPL_LABEL)
    logging.info("WORKDIR:  %s", settings.WORKDIR)

    # Read in Slurm params  (TODO: Move to abstract slurm call)
    if self.job_id is None:
      self.job_id   = os.getenv('JOB_NAME')
    self.slurm_id = os.getenv('SLURM_JOB_ID')

    logging.debug('EnVars')

    for i in ['SBATCH_JOBID', 'SBATCH_JOB_NAME', 'SLURM_JOB_ID', 'SLURM_JOBID', 'SLURM_JOB_NAME']:
      logging.debug('    %s : %s', i, os.getenv(i))

    logging.info("JOB NAME :  %s", str(self.job_id))
    logging.info("SLURM JOB:  %s", str(self.slurm_id))

    if args.debug:
      logging.debug("DEBUGGING: %s", self.name)

    if args.single:
      logging.debug("Macrothread running in single exection Mode (only 1 manager will execute).")
      self.singleuse = True

    if args.init:
      sys.exit(0)

    # Both Worker & Manager need catalog to run; load it here and import schema
    retry = 3
    connected = False
    while retry > 0:
      retry -= 1
      logging.info('Trying to establish a connection to the Catalog Service')
      try:
        self.catalog = RedisClient(settings.name)
        if self.catalog.isconnected and self.catalog.ping():
          logging.info('Catalog service is connected')
          connected = True
          break
        logging.info("Catalog service is not running. Trying to start the service now")
        self.start_local_catalog()
      except (redis.RedisError, OverlayNotAvailable):
        self.catalog = None
        self.start_local_catalog()

    if not connected:
      # If the catalog is unavailable, fail this thread and reschedule it
      if args.workinput:
        relaunch_cmd = "python3 %s -c %s -w" % (self.fname, self.config, args.workinput)
      else:
        self.slurmParams['cpus-per-task'] = 1
        relaunch_cmd = "python3 %s -c %s" % (self.fname, self.config)

      self.slurmParams['job-name'] = self.job_id
      slurm.sbatch(taskid=self.slurmParams['job-name'],
                   options=self.slurmParams,
                   modules=self.modules,
                   cmd=relaunch_cmd)
      # NOTE: This should be handled in an exception (need to figure out which one)
      #  And then raise a custom OverlayConnectionError here
      return

    # Load some self-bootstrapping metadata (if not already loaded):
    mthread_key = 'macrothread:' + self.name
    if not self.catalog.exists(mthread_key):

      self.catalog.hmset(mthread_key, {'fname': self.fname})

    self.catalog.loadSchema()   # Should this be called from within the catalog module?

    # Load metadata about registered macrothreads
    self.data['macrothread'] = {}
    for key in self.catalog.keys('macrothread'):
      mt_name = key.split(':')[1]
      self.data['macrothread'][mt_name] = self.catalog.hgetall(key)

    # Load current STATE from Catalog
    logging.info("Loading Thread State for from catalog:")

    # Load the standard set of simple params (init and simulation values).
    # By default these are immutable; any values that may change or be appended
    # during execution should be listed explicitly in _mut or _append.
    self.load(list(settings.state.keys()))
    self.load(list(settings.sim_params.keys()))

    # Load additional State values  
    self.load(self._mut, self._immut, self._append)

    if args.workinput:
      logging.debug("Running worker.")
      self.worker(args.workinput)
    else:
      self.manager()

    if self.localcatalogserver:
      logging.debug("This thread is running the catalog. Waiting on local service to terminate...")
      self.localcatalogserver.join()
      self.localcatalogserver = None
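
The connection logic in run() above reduces to a bounded retry loop: ping the catalog, and if it is unreachable, start a local service and try again. Below is a stripped-down sketch of that pattern against a plain redis-py client; connect_with_retry and the start_local callback are illustrative names, not the project's API.

import logging
import redis

def connect_with_retry(host='localhost', port=6379, retries=3, start_local=None):
  """Return a connected Redis client, or None after all attempts fail."""
  for attempt in range(1, retries + 1):
    try:
      client = redis.StrictRedis(host=host, port=port, decode_responses=True)
      if client.ping():
        logging.info('Catalog service is connected')
        return client
    except redis.RedisError:
      logging.info('Catalog unreachable (attempt %d of %d)', attempt, retries)
    if start_local is not None:
      start_local()   # e.g. launch a local Redis server, then retry
  return None         # caller falls back to rescheduling itself via sbatch
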
Example No. 4
  def manager(self, fork=False):

    logging.debug("\n==========================\n  MANAGER:  %s", self.name)


    # Check global termination:
    term_flag = self.data['terminate']
    if term_flag and term_flag.lower() in ['halt', 'stop', 'now']:
      logging.info('RECEIVED TERMINATION FLAG. Shutting down')
      sys.exit(0)

    # Load Data from Thread's State and Upstream thread
    if self.upstream:
      logging.debug("Loading upstream data: %s", self.upstream)
      self.load(self.upstream)

    # Check for termination  
    if self.term():
      logging.info('TERMINATION condition for %s', self.name)
      return 0

    # Set Elasticity Policy
    self.configElasPolicy()

    # Note: Manager can become a service daemon. Thus, we allow the manager
    #  to run along with the monitor process and assume the manager overhead
    #  is small enough to not interfere. Eventually, this will be threaded
    #  differently by preventing the local service (within this object's
    #  context) from running while the manager performs its split() function
    #  worker dispatching. The worker (below) starts a local service
    #  for reading, reads in the state, stops it, performs its work, and then
    #  starts it for writing and remains alive to monitor. Hence, we'll
    #  eventually change this next line to False or some other state value,
    #  or we'll just let this manager become the monitor and provide the
    #  service, which means it will need to immediately re-schedule itself.
    # self.catalogPersistanceState = True
    # if self.localcatalogserver and not self.catalogPersistanceState:
    #   self.catalog.stop()
    #   self.localcatalogserver = None

    #  TODO: Determine if the manager should load the entire input data set,
    #     make this abstract, or push it into the UDF portion.
    #  Defer can return either a list of items to push back or a "split" value to
    #  perform an in-line data trim on the key-store DB (optimization)
    immed, defer = self.split()

    #  The manager oversees ID assignment.
    idlabel = 'id_%s' % self.name
    self.catalog.incr(idlabel)
    nextid = self.catalog.get(idlabel)

    # first ID check 
    nextid = 0 if nextid is None else int(nextid)
    myid = self.fromMID()
    if myid is None:
      myid = int(nextid - 1)

    # No Jobs to run.... Delay and then rerun later
    if len(immed) == 0:
      delay = int(self.delay)
      logging.debug("MANAGER %s: No Available input data. Delaying %d seconds and rescheduling...." % (self.name, delay))
      self.slurmParams['begin'] = 'now+%d' % delay

    # Dispatch Workers
    else:
      workernum = 1
      delay = 180 + self.delay  

      # Set baseline Slurm params and modules (to allow for dynamic dispatching)
      baseline_param = copy.deepcopy(self.slurmParams)
      baseline_mods  = copy.deepcopy(self.modules)
      for i in immed:
        logging.debug("%s: scheduling worker, input=%s", self.name, i)
        self.preparejob(i)
        self.slurmParams['job-name'] = self.toWID(myid, workernum)
        slurm.sbatch(taskid=self.slurmParams['job-name'],
            options = self.slurmParams,
            modules = self.modules,
            cmd = "python3 %s -c %s -w %s" % (self.fname, self.config, str(i)))
        workernum += 1
        # Reset params and mods
        self.slurmParams = copy.deepcopy(baseline_param)
        self.modules     = copy.deepcopy(baseline_mods)

    # Single use exit:
    if self.singleuse:
      logging.debug("SINGLE USE INVOKED. No more managers will run ")
      return 0


    # Elasticity policy to control manager rescheduling
    self.slurmParams['begin'] = 'now+%d' % delay
    self.slurmParams['job-name'] = self.toMID(nextid)
    self.slurmParams['cpus-per-task'] = 1
    slurm.sbatch(taskid=self.slurmParams['job-name'],
                 options=self.slurmParams,
                 modules=self.modules,
                 cmd="python3 %s -c %s" % (self.fname, self.config))

    # TODO: Alternate manager rescheduling:  Trigger Based
    #   use after:job_id[:jobid...] w/ #SBATCH --dependency=<dependency_list>

    # Consume upstream input data
    logging.debug('Consuming Upstream Data....')
    if isinstance(defer, list):
      logging.debug('Deferring a list and removing %d items tasked to run immed', len(immed))
      self.catalog.removeItems(self.upstream, immed)
    elif defer is not None:
      logging.debug('Slicing %d items', defer)
      self.catalog.slice(self.upstream, defer)

    # Other internal thread state is saved back to the catalog
    self.save(self._mut)

    logging.debug("==========================")
    return 0
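
The worker-dispatch loop above relies on one detail that is easy to miss: the baseline Slurm parameters are deep-copied around each submission so per-job edits (job-name, begin, etc.) do not leak into the next worker. A hedged sketch of that loop; submit and make_worker_id stand in for the project's slurm.sbatch wrapper and toWID helper, and the script path and config name are placeholders.

import copy

def dispatch_workers(items, baseline_params, modules, submit, make_worker_id):
  """Submit one worker job per input item, each from a clean copy of the
  baseline Slurm parameters; returns the number of jobs submitted."""
  for num, item in enumerate(items, start=1):
    params = copy.deepcopy(baseline_params)    # fresh copy per worker
    params['job-name'] = make_worker_id(num)   # e.g. self.toWID(myid, num)
    submit(taskid=params['job-name'],
           options=params,
           modules=modules,
           cmd='python3 thread.py -c config -w %s' % item)
  return len(items)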