Example #1
File: initialize.py Project: DaMSL/ddc
def makejobconfig(catalog):
  logging.info('Seeding 1 job')
  settings = systemsettings()

  # Generate new set of params/coords
  pdbfile, dcdfile = deshaw.getHistoricalTrajectory_prot(0)
  traj = md.load(dcdfile, top=pdbfile, frame=0)
  jcID, params = generateNewJC(traj)

  # Catalog values come back as strings; cast before the arithmetic below
  dcdfreq = int(catalog.get('dcdfreq'))
  runtime = int(catalog.get('runtime'))
  sim_step_size = int(catalog.get('sim_step_size'))

  # Update Additional JC Params and Decision History, as needed
  config = dict(params,
      name    = jcID,
      runtime = runtime,
      dcdfreq = dcdfreq,
      interval = dcdfreq * sim_step_size,
      temp    = 310,
      timestep = 0,
      gc      = 1,
      origin  = 'deshaw',
      src_index = 0,
      src_bin  = (0, 0),
      application   = settings.name)
  logging.info("New Simulation Job Created: %s", jcID)
  for k, v in config.items():
    logging.debug("   %s:  %s", k, str(v))
  catalog.rpush('jcqueue', jcID)
  catalog.hmset(wrapKey('jc', jcID), config)
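
The function above touches the catalog through just three calls: get for scalar settings, rpush to enqueue the new job ID on jcqueue, and hmset to store the config hash. As a minimal sketch (the class below is a hypothetical stand-in, not part of DaMSL/ddc), an in-memory object with those three methods is enough to exercise the seeding logic without a live Redis catalog:

class InMemoryCatalog:
  """Hypothetical in-memory stand-in for the Redis-backed catalog."""
  def __init__(self, initial=None):
    self.kv = dict(initial or {})   # scalar settings (get)
    self.lists = {}                 # queues (rpush)
    self.hashes = {}                # job-config hashes (hmset)

  def get(self, key):
    return self.kv.get(key)

  def rpush(self, name, value):
    self.lists.setdefault(name, []).append(value)

  def hmset(self, key, mapping):
    self.hashes.setdefault(key, {}).update(mapping)

# Usage sketch: seed the settings makejobconfig() reads, then call it.
# (The call itself still needs the DEShaw trajectory files on disk.)
catalog = InMemoryCatalog({'dcdfreq': '500', 'runtime': '20000', 'sim_step_size': '2'})
# makejobconfig(catalog)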
Example #2
File: ctl_mv.py Project: DaMSL/ddc
    def execute(self, thru_index):
      """Execute the controller algorithm: load pre-analyzed lower-dimensional
      subspaces, process user queries, and identify the sampling space with a
      corresponding distribution function for each query. Calculate
      convergence rates, run the sampler, and then execute a fairness policy to
      distribute resources among users' sampled values.
      """
      logging.debug('CTL MT')

    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")

      np.set_printoptions(precision=4, linewidth=150)

      self.data['timestep'] += 1
      logging.info('TIMESTEP: %d', self.data['timestep'])

      settings = systemsettings()
      bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
      stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

      # Connect to the cache
      self.cacheclient = CacheClient(settings.APPL_LABEL)

      # create the "binlist":
      numLabels = self.data['numLabels']
      numresources = self.data['numresources']


    # LOAD all new subspaces (?) and values

      ##### BARRIER
      self.wait_catalog()

      # Load new RMS Labels -- load all for now
      bench.start()
      logging.debug('Loading RMS Labels')
      start_index = max(0, self.data['ctlIndexHead'])

      # labeled_pts_rms = self.catalog.lrange('label:rms', self.data['ctlIndexHead'], thru_index)
      logging.debug(" Start_index=%d,  thru_index=%d,   ctlIndexHead=%d", start_index, thru_index, self.data['ctlIndexHead'])

      feallist = [np.fromstring(i) for i in self.catalog.lrange('subspace:feal', 0, -1)]
      num_pts = len(feallist)
      self.data['ctlIndexHead'] = thru_index
      thru_count = self.data['observe:count']

      logging.debug('##NUM_RMS_THIS_ROUND: %d', num_pts)
      stat.collect('numpts', len(feallist))


    # Calculate variable PDF estimations for each subspace via bootstrapping:
      logging.debug("=======================  <SUBSPACE CONVERGENCE>  =========================")

      # Bootstrap current sample for RMS
      logging.info("Feature Landscapes for %d points loaded. Calculating PDF.....", len(feallist))

      #  Static for now
      blocksize = 5000
      mv_convergence = op.bootstrap_block(feallist, blocksize)
      global_landscape = np.mean(feallist, axis=0)
      stat.collect('convergence', mv_convergence)
      stat.collect('globalfeal', global_landscape)
      # logging.info('MV Convergence values:\nCONV,%s', ','.join(['%5.3f'%i for i in mv_convergence]))
      # logging.info('Global Feature Landscape:\nFEAL,%s', ','.join(['%5.3f'%i for i in global_landscape]))
      logging.info('MV Convergence values:\nCONV,%s', str(mv_convergence[-1]))
      logging.info('Global Feature Landscape:\n%s', feal.tostring(global_landscape))

    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

      ##### BARRIER
      self.wait_catalog()
      selected_index_list = []

      # QUERY PROCESSING & SAMPLING BELOW to select indices. 
      EXPERIMENT_NUMBER = self.experiment_number
      logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)


      ###### EXPERIMENT #5:  BIASED (Umbrella) SAMPLER
      if EXPERIMENT_NUMBER == 5:
        if self.catalog.exists('label:deshaw'):
          logging.info("Loading DEShaw historical points.... From Catalog")
          rmslabel = [eval(x) for x in self.catalog.lrange('label:deshaw', 0, -1)]
        else:
          logging.info("Loading DEShaw historical points.... From File (and recalculating)")
          rmslabel = deshaw.labelDEShaw_rmsd()

        deshaw_samples = {b:[] for b in binlist}
        for i, b in enumerate(rmslabel):
          deshaw_samples[b].append(i)

        coord_origin = []

        # NOTE: convergence_rms (per-bin convergence scores) is computed
        # upstream in the full controller; only its use is shown here
        conv_vals = np.array([v for k, v in sorted(convergence_rms.items())])
        norm_pdf_conv = conv_vals / sum(conv_vals)
        logging.info("Umbrella Sampling PDF (Bootstrapping):")
        sampled_distro_perbin = {b: 0 for b in binlist}

        while numresources > 0:
          # First sampling is BIASED
          selected_bin = np.random.choice(len(binlist), p=norm_pdf_conv)
          A, B = binlist[selected_bin]
          sampled_distro_perbin[binlist[selected_bin]] += 1
          # NOTE: bincounts (candidate count per bin) also comes from the full
          # controller; it guards the uniform second-stage draw below
          if bincounts[selected_bin] is not None and bincounts[selected_bin] > 0:
            # Secondary Sampling is Uniform
            sample_num = np.random.randint(bincounts[selected_bin])
            logging.debug('SAMPLER: selecting sample #%d from bin %s', 
              sample_num, str(binlist[selected_bin]))
            index = self.catalog.lindex('varbin:rms:%d_%d' % binlist[selected_bin], 
              sample_num)
            selected_index_list.append(index)
            coord_origin.append(('sim', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          elif len(deshaw_samples[binlist[selected_bin]]) > 0:
            index = np.random.choice(deshaw_samples[binlist[selected_bin]])
            logging.debug('SAMPLER: selecting DEShaw frame #%d from bin %s', 
              index, str(binlist[selected_bin]))
            # Negation indicates an historical index number
            selected_index_list.append(-index)
            coord_origin.append(('deshaw', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          else:
            logging.info("NO Candidates for bin: %s", binlist[selected_bin])



      ###### EXPERIMENT #10:  MultiVariate Nearest Neighbor (MVNN) SAMPLER
      if EXPERIMENT_NUMBER == 10:

        # Create the KD Tree from all feature landscapes (ignore first 5 features)
        kd = KDTree(100, 15, np.array(feallist), 'median')

        # Collect hypercubes
        hc = kd.getleaves()

        logging.info('KD Tree Stats')
        logging.info('    # HCubes   : %5d', len(hc))
        logging.info('    Largest  HC: %5d', max([v['count'] for k,v in hc.items()]))
        logging.info('    Smallest HC: %5d', min([v['count'] for k,v in hc.items()]))

        for key, hcube in hc.items():
          hc_feal = [feallist[i] for i in hcube['elm']]
          hc[key]['feal'] = np.mean(hc_feal, axis=0)

        #  Determine scale and/or separate scales for each feature set
        desired = 10 - global_landscape
        logging.info('Desired Landscape:\n%s', feal.tostring(desired))

        #  Calc euclidean dist to each mean HC's feal
        nn = {k: LA.norm(desired[5:] - v['feal'][5:]) for k,v in hc.items()}

        #  Grab top N Neighbors (10 for now)
        neighbors = sorted(nn.items(), key=lambda x: x[1])[:10]
        logging.info('BestFit Landscape:\n%s', feal.tostring(hc[neighbors[0][0]]['feal']))

        ## DATA SAMPLER
        # Weight the top neighbors by their normalized distances.
        # NOTE: as written, a larger distance yields a larger weight; invert
        # the distances if nearer hypercubes should be favored.
        nn_keys = [i for i, w in neighbors]
        nn_wgts = np.array([w for i, w in neighbors])
        nn_wgts /= np.sum(nn_wgts)  # normalize into a probability vector

        coord_origin = []
        while numresources > 0:
          # First sampling is BIASED
          selected_hc = np.random.choice(nn_keys, p=nn_wgts)

          # Second is UNIFORM (within the HCube)
          index = np.random.choice(hc[selected_hc]['elm'])
          selected_index_list.append(index)
          src_state = np.argmax(feallist[index][:5])
          coord_origin.append(('sim', index, src_state, selected_hc))
          logging.info('Sampled Landscape [hc=%s]:\n%s', selected_hc, feal.tostring(feallist[index]))

          numresources -= 1


      elif EXPERIMENT_NUMBER == 11:

        #  Use only the right-most 10 features (the non-normalized ones)
        inventory = np.array([f[10:] for f in feallist])
        desired = 10 - global_landscape
        logging.info('Desired Landscape (NOTE: only including A-B values):\n%s', feal.tostring(desired))

        selected_index_list = mvkp.knapsack(desired[10:], inventory, numresources, 2000000)
        coord_origin = [('sim', index, np.argmax(feallist[index][:5]), 'D') for index in selected_index_list]
        logging.info("KNAPSACK Completed:")
        logging.info('Target Distribution:\n%s', str(desired[10:]))
        logging.info('Selected Landscapes:\n%s', '\n'.join(['%s' % feallist[i] for i in selected_index_list]))

    # Back Project to get new starting Coords for each sample  
      logging.debug("=======================  <INPUT PARAM GENERATION>  =================")
      logging.info('All Indices sampled. Back projecting to high dim coords')
      sampled_set = []
      for i in selected_index_list:
        traj = self.backProjection([i])
        sampled_set.append(traj)
      bench.mark('Sampler')

    # Generate new starting positions
      runtime = self.data['runtime']
      jcqueue = OrderedDict()
      for i, start_traj in enumerate(sampled_set):
        jcID, params = generateNewJC(start_traj)

        # TODO:  Update/check adaptive runtime, starting state
        jcConfig = dict(params,
              name    = jcID,
              runtime = runtime,     # In timesteps
              dcdfreq = self.data['dcdfreq'],           # Frame save rate
              interval = self.data['dcdfreq'] * self.data['sim_step_size'],                       
              temp    = 310,
              timestep = self.data['timestep'],
              gc      = 1,
              origin  = coord_origin[i][0],
              src_index = coord_origin[i][1],
              src_bin  = coord_origin[i][2],
              src_hcube = coord_origin[i][3],
              application   = settings.APPL_LABEL)

        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in jcConfig.items():
          logging.debug("   %s:  %s", k, str(v))

        #  Add to the output queue & save config info
        jcqueue[jcID] = jcConfig
        logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

      bench.mark('GenInputParams')

    #  POST-PROCESSING  -------------------------------------
      logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
          
      self.wait_catalog()

      # Clear the current queue, mark previously queued jobs for GC, push the new queue
      qlen = self.catalog.llen('jcqueue')
      logging.debug('Current queue len:   %s', str(qlen))
      if qlen > 0:
        curqueue = self.catalog.lrange('jcqueue', 0, -1)
        logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
        for jc_key in curqueue:
          key = wrapKey('jc', jc_key)
          config = self.catalog.hgetall(key)
          config['gc'] = 0
          # Add the job to the state to write back to the catalog (flags it for GC)
          self.addMut(key, config)
        self.catalog.delete('jcqueue')

      #  CATALOG UPDATES
      self.catalog.rpush('datacount', len(feallist))

      #  EXPR 7 Update:
      if 5 < EXPERIMENT_NUMBER < 10:
        # NOTE: covar_pts is gathered by the covariance experiments (6-9),
        # which are not included in this excerpt
        # self.catalog.storeNPArray(np.array(centroid), 'subspace:covar:centroid:%d' % cov_iteration)
        self.catalog.rpush('subspace:covar:thruindex', len(covar_pts))

      # Update cache hit/miss counters (guard against a zero denominator)
      hit = self.cache_hit
      miss = self.cache_miss
      logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, hit / max(1, hit + miss))
      self.catalog.rpush('cache:hit', self.cache_hit)
      self.catalog.rpush('cache:miss', self.cache_miss)

      self.data['jcqueue'] = list(jcqueue.keys())

      logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
      # Update each new job with the latest convergence score and save to catalog (TODO: save may not be necessary)
      logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
      for jcid, config in jcqueue.items():
        # config['converge'] = self.data['converge']
        self.addMut(wrapKey('jc', jcid), config)
 
      bench.mark('PostProcessing')
      print('## TS=%d' % self.data['timestep'])
      bench.show()
      stat.show()

      return list(jcqueue.keys())
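
Experiment #5 above is a two-stage sampler: the first draw is biased across transition bins by a convergence-derived PDF, and the second draw is uniform within the chosen bin, falling back to the historical DEShaw pool when a bin has no simulated candidates. Below is a self-contained sketch of that biased-then-uniform pattern; the bins, candidates, and convergence scores are made-up stand-ins for the catalog data:

import numpy as np

# Made-up stand-ins: four transition bins, candidate frame indices per bin,
# and a per-bin convergence score (higher score => sample that bin more often)
binlist = [(0, 0), (0, 1), (1, 0), (1, 1)]
candidates = {(0, 0): [3, 7, 9], (0, 1): [], (1, 0): [2], (1, 1): [5, 8]}
convergence = np.array([0.5, 0.1, 0.3, 0.1])

pdf = convergence / convergence.sum()       # stage 1: biased PDF over bins
selected, resources = [], 4
while resources > 0:
  b = binlist[np.random.choice(len(binlist), p=pdf)]
  if candidates[b]:                         # stage 2: uniform within the bin
    selected.append(int(np.random.choice(candidates[b])))
    resources -= 1
  # else: the real controller falls back to historical DEShaw frames here
print(selected)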
Example #3
File: ctl_mv.py Project: DaMSL/ddc
    wells = [(0, 2099, 684),
            (1, 630, 602),
            (2, 2364, 737),
            (3, 3322, 188),
            (4, 2108, 258)]
    print('GEN DATA!')

    #  REDO WELLs
    for well in wells:
      state, win, frame = well

      logging.error(' NEED TO REDO Well start point gen')

      pdbfile, archiveFile = getHistoricalTrajectory(win)

      jcID, params = generateNewJC(archiveFile, pdbfile, frame, jcid='test_%d'%state)

      jcConfig = dict(params,
          name    = jcID,
          runtime = 20000,
          interval = 500,
          temp    = 310,
          state   = state,
          weight  = 0.0,
          timestep = 0,
          gc      = 1,
          application   = DEFAULT.APPL_LABEL)

      print("Data Generated! ")
      print("Job = ", jcID)
      for k, v in jcConfig.items():
        print("   %s:  %s" % (k, str(v)))
Example #4
File: initialize.py Project: DaMSL/ddc
def seedJob_Uniform(catalog, num=1, exact=None):
  """
  Seeds jobs into the JCQueue -- pulled from DEShaw
  Selects equal `num` of randomly start frames from each bin
  to seed as job candidates
  """
  logging.info('Seeding %d jobs per transition bin', num)
  settings = systemsettings()
  numLabels = int(catalog.get('numLabels'))
  binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]

  dcdfreq = int(catalog.get('dcdfreq'))
  runtime = int(catalog.get('runtime'))
  sim_step_size = int(catalog.get('sim_step_size'))

  if catalog.exists('label:deshaw'):
    # Labels are stored as stringified tuples, e.g. "(1, 3)"; eval() rebuilds them
    rmslabel = [eval(x) for x in catalog.lrange('label:deshaw', 0, -1)]
  elif os.path.exists(DESHAW_LABEL_FILE):
    logging.info('Loading DEShaw Points From File....')
    with open(DESHAW_LABEL_FILE) as lfile:
      rmslabel = [eval(label) for label in lfile.read().strip().split('\n')]
    logging.info('Loaded DEShaw %d Labels from file, %s', len(rmslabel), DESHAW_LABEL_FILE)
    pipe = catalog.pipeline()
    for rms in rmslabel:
      pipe.rpush('label:deshaw', rms)
    pipe.execute()
    logging.info('DEShaw Labels stored in the catalog.')
  else:
    rmslabel = labelDEShaw_rmsd(store_to_disk=True)
    pipe = catalog.pipeline()
    for rms in rmslabel:
      pipe.rpush('label:deshaw', rms)
    pipe.execute()
    logging.info('DEShaw Labels stored in the catalog.')
  logging.info('Grouping all prelabeled Data:')
  groupby = {b:[] for b in binlist}

  for i, b in enumerate(rmslabel):
    groupby[b].append(i)

  for k in sorted(groupby.keys()):
    v = groupby[k]
    logging.info('%s %7d %4.1f', str(k), len(v), (100*len(v)/len(rmslabel)))

  if exact is None:
    source_list = sorted(groupby.keys())
  else:
    bin_list = list(groupby.keys())
    # 25 = 5x5 transition bins (assumes numLabels == 5); sample bins without
    # replacement when `exact` fits, with replacement otherwise
    if exact <= 25:
      idx_list = np.random.choice(len(bin_list), exact, replace=False)
    else:
      idx_list = np.random.choice(len(bin_list), exact, replace=True)
    source_list = [bin_list[i] for i in idx_list]

  for binlabel in source_list:
    clist = groupby[binlabel]
    A, B = binlabel

    # No candidates
    if len(clist) == 0:
      logging.info('NO Candidates for %s', str(binlabel))
      if binlabel == (1, 3):
        logging.info('Swapping (1,2) for (1,3)')
        clist = groupby[(1,2)]
        B = 2
      elif binlabel == (3, 1):
        logging.info('Swapping (3,0) for (3,1)')
        clist = groupby[(3,0)]
        B = 0
      else:
        logging.info('Not sampling this bin')
        continue

    for k in range(num):
      logging.debug('\nSeeding Job #%d for bin (%d,%d) ', k, A, B)
      index = np.random.choice(clist)
      src, frame = deshaw.refFromIndex(index)
      logging.debug("   Selected: BPTI %s, frame: %s", src, frame)
      pdbfile, dcdfile = deshaw.getHistoricalTrajectory_prot(int(src))
      traj = md.load(dcdfile, top=pdbfile, frame=int(frame))

      # Generate new set of params/coords
      jcID, params = generateNewJC(traj)

      # Update Additional JC Params and Decision History, as needed
      config = dict(params,
          name    = jcID,
          runtime = runtime,
          dcdfreq = dcdfreq,
          interval = dcdfreq * sim_step_size,                       
          temp    = 310,
          timestep = 0,
          gc      = 1,
          origin  = 'deshaw',
          src_index = index,
          src_bin  = (A, B),
          src_hcube = 'D',
          application   = settings.APPL_LABEL)
      logging.info("New Simulation Job Created: %s", jcID)
      for key, val in config.items():
        logging.debug("   %s:  %s", key, str(val))
      catalog.rpush('jcqueue', jcID)
      catalog.hmset(wrapKey('jc', jcID), config)
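
seedJob_Uniform's core move is grouping every labeled frame by its (A, B) transition bin and then drawing uniformly within each bin. A compact sketch of that group-then-sample step follows; the labels and sizes below are made-up, where the real ones come from label:deshaw:

import numpy as np

# Made-up frame labels: frame i belongs to transition bin rmslabel[i]
rmslabel = [(0, 0), (0, 1), (0, 0), (1, 1), (0, 1), (0, 0)]
numLabels, num = 2, 1   # 2 states => 4 transition bins; seed 1 job per bin

binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
groupby = {b: [] for b in binlist}
for i, b in enumerate(rmslabel):
  groupby[b].append(i)                    # group frame indices by bin

for b in binlist:
  if not groupby[b]:
    print('NO candidates for', b)         # the real code swaps in a nearby bin
    continue
  for _ in range(num):
    frame = int(np.random.choice(groupby[b]))
    print('Seed job for bin %s from frame %d' % (b, frame))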