Example No. 1
def load_PCA_Subspace(catalog):

  # HCube leaf size of 500 points
  settings = systemsettings()
  vectfile = settings.PCA_VECTOR_FILE

  logging.info("Loading PCA Vectors from %s", vectfile)
  pc_vect = np.load(vectfile)
  max_pc = pc_vect.shape[1]
  num_pc = min(settings.PCA_NUMPC, max_pc)
  pc = pc_vect[:num_pc]
  logging.info("Storing PCA Vectors to key:  %s", 'pcaVectors')
  catalog.storeNPArray(pc, 'pcaVectors')

  logging.info("Loading Pre-Calculated PCA projections from Historical BPTI Trajectory")
  pre_calc_deshaw = np.load('data/pca_applied.npy')

  # Extract only nec'y PC's
  pts = pre_calc_deshaw.T[:num_pc].T

  pipe = catalog.pipeline()
  for si in pts:
    pipe.rpush('subspace:pca', bytes(si))
  pipe.execute()
  logging.debug("PCA Subspace stored in Catalog")

  logging.info('Creating KD Tree')
  kd = KDTree(500, maxdepth=8, data=pts)
  logging.info('Encoding KD Tree')
  packaged = kd.encode()
  encoded = json.dumps(packaged)
  logging.info('Storing in catalog')
  catalog.delete('hcube:pca')
  catalog.set('hcube:pca', encoded)
  logging.info('PCA Complete')
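
For reference, a minimal sketch of the read path for what Example No. 1 stores: the packed points under 'subspace:pca' and the JSON-encoded tree under 'hcube:pca'. It assumes a redis-like catalog (get/lrange) and the project's KDTree.reconstruct as used in the anl.py example further down; the function name here is illustrative only.

import json
import numpy as np
# from <project kdtree module> import KDTree   # project-specific import, path omitted

def read_pca_subspace(catalog):
  # Decode the low-dim points that were rpush'ed as raw float64 bytes
  packed = catalog.lrange('subspace:pca', 0, -1)
  subspace_pca = np.array([np.fromstring(x) for x in packed])
  # Rebuild the KD tree from its JSON encoding plus the point data
  hcube_mapping = json.loads(catalog.get('hcube:pca'))
  return KDTree.reconstruct(hcube_mapping, subspace_pca)
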
Example No. 2
  def doTree(self, pc=4, ktype='rbf', leafsize=100, maxdepth=6, split='middle'):
    diag = np.array([np.diag(i) for i in self.covar])
    kpca = PCAKernel(pc, ktype)
    kpca.solve(diag)
    gdata = kpca.project(diag)

    # Create KDTree over reduced-dim subspace (4-PCs)
    gtree  = KDTree(leafsize, maxdepth, gdata, split) # Or Max Gap
    self.hc = gtree.getleaves()
Example No. 3
def make_kdtree(feal_list):
  kdtree1 = KDTree(50, maxdepth=4, data=feal_list, method='median')
  hc1 = kdtree1.getleaves()
  for k, v in hc1.items():
    src_pts = []
    for i in v['elm']:
      # tidx and rms_val are assumed module-level globals: tidx maps an element
      # index back to a (trajectory, frame) pair; rms_val holds per-frame RMSD rows
      a, b = tidx[i]
      src_pts.append(rms_val[a][b])
    print(k, np.mean(src_pts, axis=0))
Example No. 4
 def kdtree(self, leafsize, depth, method):
   self.index = []
   allpts = []
   # Recalc for indexing
   flist = [[self.feal_atemp(i) for i in self.rmsd[tr]] for tr in self.rmsd.keys()]
   for trnum, f in enumerate(flist):
     for i, tup in enumerate(f):
       allpts.append(tup[5:])
       self.index.append((trnum, i))
   self.kd = KDTree(leafsize, depth, np.array(allpts), method)
   self.hc = self.kd.getleaves()
Example No. 5
def reproj_distro():
    local = get_local()
    data = get_edges()
    r2 = redis.StrictRedis(port=6385, decode_responses=True)
    for tbin in [(2, 4), (2, 2), (4, 4), (4, 2), (4, 1), (3, 1)]:
        print('Processing:', tbin)
        tkey = '%d_%d' % tbin
        # Get Kernel
        kpca_key = 'subspace:pca:kernel:' + tkey
        kpca = PCAnalyzer.load(r2, kpca_key)
        # Get Training Data
        data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
        pca_pts = np.array([np.fromstring(x) for x in data_raw])
        kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
        proj_pts = kpca.project(alpha.xyz)  # NOTE: 'alpha' (a filtered trajectory) must be defined in the enclosing scope
        biased_hcubes = []
        for i, pt in enumerate(proj_pts):
            biased_hcubes.append(kdtree.probe(pt, probedepth=9))
        if len(data) == 0:
            print('No Raw PCA data points for bin %s.... Going to next bin'
                  % str(tbin))
            continue
        counts = {}
        for i in biased_hcubes:
            if i not in counts:
                counts[i] = 0
            counts[i] += 1
        for i in local[tkey]['keys']:
            if i not in counts:
                counts[i] = 0
        print('check')
        cvect = [counts[i] for i in local[tkey]['keys']]
        d = np.array(cvect) / sum(cvect)
        c = np.array(data[tkey])
        lcnt = np.sum(c, axis=0)
        gcnt = np.sum(c, axis=1)
        norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
        # Add biased data as a col
        kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
        kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
        arr = np.vstack((norm, kpca_cnt_norm, d)).T
        rowlist = tuple(gcnt) + (
            'localKPCA',
            'biased',
        )
        P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
Example No. 6
def calcDEShaw_PCA(catalog, force=False):
  numPC = 3

  numpts = catalog.llen('subspace:pca')
  if numpts == 0 or force:
    catalog.delete('subspace:pca')
    logging.debug("Projecting DEshaw PCA Vectors (assuming PC's are pre-calc'd")
    pcavect = catalog.loadNPArray('pcaVectors')
    logging.debug("Loaded PCA Vectors: %s, %s", str(pcavect.shape), str(pcavect.dtype))
    src = np.load(DESHAW_PTS_FILE)
    logging.debug("Loaded source points: %s, %s", str(src.shape), str(src.dtype))
    pcalist = np.zeros(shape=(len(src), numPC))
    start = dt.datetime.now()
    pdbfile, dcdfile = deshaw.getHistoricalTrajectory(0)
    traj = md.load(dcdfile, top=pdbfile, frame=0)
    filt = traj.top.select_atom_indices(selection='heavy')
    pipe = catalog.pipeline()
    for i, conform in enumerate(src):
      if i % 10000 == 0:
        logging.debug("Projecting: %d", i)
      heavy = np.array([conform[k] for k in filt])
      np.copyto(pcalist[i], np.array([np.dot(heavy.reshape(pc.shape),pc) for pc in pcavect[:numPC]]))
      raw_index = i * DESHAW_SAMPLE_FACTOR
      pipe.rpush('xid:reference', '(-1, %d)' % raw_index)
    end = dt.datetime.now()
    logging.debug("Projection time = %d", (end-start).seconds)

    rIdx = []
    for si in pcalist:
      # rpush is only queued on the pipeline here; the actual list indices are
      # returned by pipe.execute() below
      rIdx.append(pipe.rpush('subspace:pca', bytes(si)))
    pipe.execute()
    logging.debug("R_Index Created (pca)")
  else:
    logging.info('PCA Already created. Retrieving existing lower dim pts')
    X = catalog.lrange('subspace:pca', 0, -1)
    pcalist = np.array([np.fromstring(si) for si in X])

  # HCube leaf size of 500 points
  logging.info('Creating KD Tree')
  kd = KDTree(500, data=pcalist)
  logging.info('Encoding KD Tree')
  encoded = json.dumps(kd.encode())
  logging.info('Storing in catalog')
  catalog.delete('hcube:pca')
  catalog.set('hcube:pca', encoded)
  logging.info('PCA Complete')
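
A note on the serialization used throughout these examples: each point is pushed as raw bytes (bytes(si)) and later recovered with np.fromstring, which defaults to float64 and discards the array shape. A self-contained sketch of that round trip (np.frombuffer is the non-deprecated equivalent for binary input):

import numpy as np

pt = np.array([1.0, 2.5, -0.75])     # one low-dim point (float64)
packed = bytes(pt)                   # raw native-endian bytes, same as pt.tobytes()
restored = np.frombuffer(packed)     # equivalent to np.fromstring on binary data
assert np.array_equal(pt, restored)  # result is flat; reshape if the original was 2-D
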
Example No. 7
def kpca_check(red_db, tbin='(0, 4)'):
  if isinstance(red_db, list):
    rlist = red_db
  else:
    rlist = [red_db]

  trajlist = []
  for r in rlist:
    print('Pulling data...')
    obslist = r.lrange('label:rms', 0, -1)
    idxlist = [i for i, o in enumerate(obslist) if o == tbin]
    traj = dh.backProjection(r, idxlist)
    alpha = datareduce.filter_alpha(traj)
    trajlist.append(alpha)

  deidx = lambda i: deidx_cutlist(i, [t.n_frames for t in trajlist])

  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)   # NOTE: 'alpha' here is the trajectory from the last loop iteration above
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()

  srcidx = [[i[0] \
    for i in db.runquery("select idx from jc where bin='0_4' and expid=%d"%e)] \
    for e in range(32, 36)]

  src_traj = [dh.backProjection(r, i) for r, i in zip(rlist, srcidx)]
  src_xyz = [datareduce.filter_alpha(t).xyz for t in src_traj]
  probe_res = [[kdtree1.project(i.reshape(174,)) for i in xyz] for xyz in src_xyz]

  grp_src = []
  for p, s in zip(probe_res, srcidx):
      grp = {}
      for h, i in zip(p, s):
        if h not in grp:
          grp[h] = []
        grp[h].append(i)
      grp_src.append(grp)

  idx_se_map = [{i: (s, e) for i, s, e in db.runquery("select idx, start, end from jc where bin='0_4' and expid=%d"%eid)} for eid in range(32, 36)]
Example No. 8
def reproj_distro():
  local = get_local()
  data  = get_edges()
  r2 = redis.StrictRedis(port=6385, decode_responses=True)
  for tbin in [(2,4), (2,2), (4,4), (4,2), (4,1), (3,1)]:
    print('Processing:', tbin)
    tkey = '%d_%d' % tbin
    # Get Kernel
    kpca_key = 'subspace:pca:kernel:' + tkey
    kpca = PCAnalyzer.load(r2, kpca_key)
    # Get Training Data
    data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
    pca_pts = np.array([np.fromstring(x) for x in data_raw])
    kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
    proj_pts = kpca.project(alpha.xyz)  # NOTE: 'alpha' (a filtered trajectory) must be defined in the enclosing scope
    biased_hcubes = []
    for i, pt in enumerate(proj_pts):
      biased_hcubes.append(kdtree.probe(pt, probedepth=9))
    if len(data) == 0:
      print('No Raw PCA data points for bin %s.... Going to next bin' % str(tbin))
      continue
    counts = {}
    for i in biased_hcubes:
      if i not in counts:
        counts[i] = 0
      counts[i] += 1
    for i in local[tkey]['keys']:
      if i not in counts:
        counts[i] = 0
    print('check')
    cvect = [counts[i] for i in local[tkey]['keys']]
    d = np.array(cvect)/sum(cvect)
    c = np.array(data[tkey])
    lcnt = np.sum(c, axis=0)
    gcnt = np.sum(c, axis=1)
    norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
    # Add biased data as a col
    kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
    kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
    arr = np.vstack((norm, kpca_cnt_norm, d)).T
    rowlist = tuple(gcnt) + ('localKPCA', 'biased',)
    P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
Example No. 9
def high_low_check(r, tbin='(0, 4)'):
  print('Pulling data...')
  obslist = r.lrange('label:rms', 0, -1)
  ob04 = [i for i, o in enumerate(obslist) if o == tbin]
  traj = backProjection(r, ob04)
  alpha = datareduce.filter_alpha(traj)
  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()
  print('KDTree2')
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()
  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  dd = np.zeros(shape=(len(s1), len(s2)))
  print('     ', ' '.join(hc1k))
  for i, a in enumerate(s1):
    print('  ' +hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d'%n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
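
The dd matrix computed above is a cross-tabulation of leaf membership between the two trees (how many elements each leaf of the kPCA tree shares with each leaf of the raw-coordinate tree). Its core, with small hand-made sets standing in for the 'elm' lists:

s1 = [{0, 1, 2, 3}, {4, 5, 6}]       # elements per leaf, tree 1 (stand-in)
s2 = [{0, 1, 9}, {2, 3, 4, 5, 6}]    # elements per leaf, tree 2 (stand-in)
dd = [[len(a & b) for b in s2] for a in s1]
print(dd)                            # [[2, 2], [0, 3]]
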
Example No. 10
File: anl.py  Project: DaMSL/ddc
    def execute(self, jobkey):
        bench = microbench()
        config = self.data[jobkey]
        jobname = unwrapKey(jobkey)

        # RETRIEVE from Cache   --> Local Mem:
        allux = AlluxioClient()

        trajfile = config['name'] + '.dcd'
        # >>>>Storing DCD into shared memory on this node
        ramdisk = '/dev/shm/out/'
        if not os.path.exists(ramdisk):
            os.mkdir(ramdisk)
        logging.info("Created ramdisk at: %s", ramdisk)
        bench.mark('mk_ramdisk')
        allux.cp(trajfile, ramdisk)
        bench.mark('cpFromAllux')
        logging.info("Copied to shared mem: %s", os.listdir(ramdisk))
        dcd_ramfile = os.path.join(ramdisk, trajfile)

        # 1. Check if source files exist
        # logging.debug("1. Check for file")
        # if not (os.path.exists(config['dcd']) and os.path.exists(config['pdb'])):
        #   logging.error('Source Files not found: %s, %s', config['dcd'], config['pdb'])
        #   return []

        # 2. Load raw data from trajectory file
        logging.debug("2. Load DCD")
        # bench.start()
        # traj = datareduce.filter_heavy(config['dcd'], config['pdb'])
        # load from Local Mem Now
        traj = datareduce.filter_heavy(dcd_ramfile, config['pdb'])
        bench.mark('File_Load')
        logging.debug('Trajectory Loaded: %s (%s)', config['name'], str(traj))

        # 3. Update Catalog with HD points (TODO: cache this)
        #  TODO: Pipeline all
        # off-by-1: append list returns size (not inserted index)
        #  ADD index to catalog
        # Off by 1 error for index values
        file_idx = self.catalog.append({'xid:filelist': [config['dcd']]})[0]
        delta_xid_index = [(file_idx - 1, x) for x in range(traj.n_frames)]
        global_idx = self.catalog.append({'xid:reference': delta_xid_index})
        global_xid_index_slice = [x - 1 for x in global_idx]
        bench.mark('Indx_Update')

        # 4. Update higher dimensional index
        # Logical Sequence # should be unique seq # derived from manager (provides this
        #  worker's instantiation with a unique ID for indexing)
        mylogical_seqnum = str(self.seqNumFromID())
        self.catalog.hset('anl_sequence', config['name'], mylogical_seqnum)

        # INSERT NEW points here into cache/archive
        logging.debug(
            " Loading new conformations into cache....TODO: NEED CACHE LOC")
        # for i in range(traj.n_frames):
        #   cache.insert(global_xid_index_slice[i], traj.xyz[i])

        # 5a. Subspace Calculation: RMS
        #------ A:  RMSD  ------------------
        #     S_A = rmslist

        # 1. Calc RMS for each conform to all centroids
        logging.debug("3. RMS Calculation")
        numLabels = len(self.data['centroid'])
        numConf = len(traj.xyz)
        rmsraw = np.zeros(shape=(numConf, numLabels))
        for i, conform in enumerate(traj.xyz):
            np.copyto(
                rmsraw[i],
                np.array([
                    LA.norm(conform - cent) for cent in self.data['centroid']
                ]))
        logging.debug('  RMS:  %d points projected to %d centroid-distances',
                      numConf, numLabels)

        # 2. Account for noise
        #    For now: noise is user-configured; TODO: Factor in to Kalman Filter
        noise = DEFAULT.OBS_NOISE
        stepsize = 500 if 'interval' not in config else int(config['interval'])
        nwidth = noise // (2 * stepsize)
        noisefilt = lambda x, i: np.mean(
            x[max(0, i - nwidth):min(i + nwidth, len(x))], axis=0)

        # Notes: Delta_S == rmslist
        rmslist = np.array([noisefilt(rmsraw, i) for i in range(numConf)])

        # 3. Append new points into the data store.
        #    r_idx is the returned list of indices for each new RMS point
        #  TODO: DECIDE on retaining a Reservoir Sample
        #    for each bin OR to just cache all points (per strata)
        #  Reservoir Sampling is Stratified by subspaceHash
        # logging.debug('Creating reservoir Sample')
        # reservoirSampling(self.catalog, traj.xyz, rIdx, subspaceHash,
        #     lambda x: tuple([x]+list(traj.xyz.shape[1:])),
        #     'rms', lambda key: '%d_%d' % key)
        r_idx = []
        pipe = self.catalog.pipeline()
        for si in rmslist:
            pipe.rpush('subspace:rms', bytes(si))
        idxlist = pipe.execute()
        for i in idxlist:
            r_idx.append(int(i) - 1)

        logging.debug("R_Index Created (rms).")

        # 4. Apply Heuristics Labeling
        logging.debug('Applying Labeling Heuristic')
        rmslabel = []
        subspaceHash = {}
        for i, rms in enumerate(rmslist):
            #  Sort RMSD by proximity & set state A as nearest state's centroid
            prox = np.argsort(rms)
            A = prox[0]

            #  Calc absolute proximity between the nearest 2 states' centroids.
            #  THETA is derived from a static run: it is based on the average std dev of all
            #   RMSDs from a static run of BPTI without solvent. It could be dynamically
            #   calculated, but is hard-coded here. Theta is divided by four based on
            #   analysis of the DEShaw data (roughly 3% of DEShaw frames are in transition).
            avg_stddev = 0.34119404492089034
            theta = avg_stddev / 4.

            # NOTE: The original formulation was relative. Retained here for reference:
            # Rel vs Abs: Calc relative proximity for top 2 nearest centroids
            # relproximity = rms[A] / (rms[A] + rms[rs[1]])
            # B = rs[1] if relproximity > (.5 - theta) else A
            # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  #relative
            proximity = abs(rms[prox[1]] - rms[A])  #abs

            #  (TODO:  Factor in more than top 2, better noise)
            #  Label secondary sub-state
            B = prox[1] if proximity < theta else A
            rmslabel.append((A, B))
            if (A, B) not in subspaceHash:
                subspaceHash[(A, B)] = []
                logging.debug("Found Label: %s", str((A, B)))
            subspaceHash[(A, B)].append(i)

        # Update Catalog
        idxcheck = self.catalog.append({'label:rms': rmslabel})
        bench.mark('RMS')

        # 5b. Subspace Calculation: PCA
        #------ B:  PCA  -----------------
        # 1. Project Pt to PC's for each conform (top 3 PC's)
        logging.debug("Using following PCA Vectors: %s",
                      str(self.data['pcaVectors'].shape))
        pcalist = datareduce.PCA(traj.xyz, self.data['pcaVectors'], numpc=3)

        # 2. Append subspace in catalog
        p_idx = []
        pipe = self.catalog.pipeline()
        for si in pcalist:
            pipe.rpush('subspace:pca', bytes(si))
        idxlist = pipe.execute()
        for i in idxlist:
            p_idx.append(int(i) - 1)
        logging.debug("P_Index Created (pca) for delta_S_pca")

        # 3. Perform tiling over the subspace
        #   For Now: Load entire tree into local memory
        hcube_mapping = json.loads(self.catalog.get('hcube:pca'))
        logging.debug('# Loaded keys = %d', len(hcube_mapping.keys()))

        # 4. Pull entire Subspace (for now)
        #   Note: this is more efficient than inserting new points
        #   due to underlying Redis insertions / index look-ups.
        #   If this becomes a bottleneck, we may need a custom redis client
        #   connection that persists the connection and keeps the socket open (on both ends),
        #   or handle some of this I/O server-side via a Lua script
        packed_subspace = self.catalog.lrange('subspace:pca', 0, -1)
        subspace_pca = np.array([np.fromstring(x) for x in packed_subspace])

        # TODO: accessor function is for 1 point (i) and 1 axis (j).
        #  Optimize by changing to pipeline  retrieval for all points given
        #  a list of indices with an axis (if nec'y)
        logging.debug("Reconstructing the tree...")
        hcube_tree = KDTree.reconstruct(hcube_mapping, subspace_pca)

        # logging.debug("Inserting Delta_S_pca into KDtree (hcubes)")
        # for i in range(len(pcalist)):
        #   hcube_tree.insert(pcalist[i], p_idx[i])

        # TODO: Ensure hcube_tree is written to catalog
        # TODO: DECIDE on retaining a Reservoir Sample
        # reservoirSampling(self.catalog, traj.xyz, r_idx, subspaceHash,
        #     lambda x: tuple([x]+list(traj.xyz.shape[1:])),
        #     'pca',
        #     lambda key: '%d_%d' % key)
        bench.mark('PCA')

        bench.show()
        return [config['name']]
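
The (A, B) labeling heuristic in step 4 above can be exercised in isolation. A minimal sketch with synthetic centroid-distance rows; theta is copied from the hard-coded value in the comment:

import numpy as np

avg_stddev = 0.34119404492089034
theta = avg_stddev / 4.

def label_rms(rms):
  # A = nearest centroid; B = second-nearest only if the distance gap is within theta
  prox = np.argsort(rms)
  A = prox[0]
  proximity = abs(rms[prox[1]] - rms[A])
  B = prox[1] if proximity < theta else A
  return (int(A), int(B))

print(label_rms(np.array([0.41, 0.44, 0.90, 1.20, 1.10])))  # gap 0.03 < theta  -> (0, 1)
print(label_rms(np.array([0.41, 0.80, 0.90, 1.20, 1.10])))  # gap 0.39 >= theta -> (0, 0)
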
Example No. 11
File: ctl_mv.py  Project: DaMSL/ddc
    def execute(self, thru_index):
      """Executing the Controler Algorithm. Load pre-analyzed lower dimensional
      subspaces, process user query and identify the sampling space with 
      corresponding distribution function for each user query. Calculate 
      convergence rates, run sampler, and then execute fairness policy to
      distribute resources among users' sampled values.
      """
      logging.debug('CTL MT')

    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")

      np.set_printoptions(precision=4, linewidth=150)

      self.data['timestep'] += 1
      logging.info('TIMESTEP: %d', self.data['timestep'])

      settings = systemsettings()
      bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
      stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

      # Connect to the cache
      self.cacheclient = CacheClient(settings.APPL_LABEL)

      # create the "binlist":
      numLabels = self.data['numLabels']
      numresources = self.data['numresources']


    # LOAD all new subspaces (?) and values

      ##### BARRIER
      self.wait_catalog()

      # Load new RMS Labels -- load all for now
      bench.start()
      logging.debug('Loading RMS Labels')
      start_index = max(0, self.data['ctlIndexHead'])

      # labeled_pts_rms = self.catalog.lrange('label:rms', self.data['ctlIndexHead'], thru_index)
      logging.debug(" Start_index=%d,  thru_index=%d,   ctlIndexHead=%d", start_index, thru_index, self.data['ctlIndexHead'])

      feallist = [np.fromstring(i) for i in self.catalog.lrange('subspace:feal', 0, -1)]
      num_pts = len(feallist)
      self.data['ctlIndexHead'] = thru_index
      thru_count = self.data['observe:count']

      logging.debug('##NUM_RMS_THIS_ROUND: %d', num_pts)
      stat.collect('numpts', len(feallist))


    # Calculate variable PDF estimations for each subspace via bootstrapping:
      logging.debug("=======================  <SUBSPACE CONVERGENCE>  =========================")

      # Bootstrap current sample for RMS
      logging.info("Feature Landscapes for %d points loaded. Calculating PDF.....", len(feallist))

      #  Static for now
      blocksize = 5000
      mv_convergence = op.bootstrap_block(feallist, blocksize)
      global_landscape = np.mean(feallist, axis=0)
      stat.collect('convergence', mv_convergence)
      stat.collect('globalfeal', global_landscape)
      # logging.info('MV Convergence values:\nCONV,%s', ','.join(['%5.3f'%i for i in mv_convergence]))
      # logging.info('Global Feature Landscape:\nFEAL,%s', ','.join(['%5.3f'%i for i in global_landscape]))
      logging.info('MV Convergence values:\nCONV,%s', str(mv_convergence[-1]))
      logging.info('Global Feature Landscape:\n%s', feal.tostring(global_landscape))

    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

      ##### BARRIER
      self.wait_catalog()
      selected_index_list = []

      # QUERY PROCESSING & SAMPLING BELOW to select indices. 
      EXPERIMENT_NUMBER = self.experiment_number
      logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)


      ###### EXPERIMENT #5:  BIASED (Umbrella) SAMPLER
      if EXPERIMENT_NUMBER == 5:
        if self.catalog.exists('label:deshaw'):
          logging.info("Loading DEShaw historical points.... From Catalog")
          rmslabel = [eval(x) for x in self.catalog.lrange('label:deshaw', 0, -1)]
        else:
          logging.info("Loading DEShaw historical points.... From File (and recalculating)")
          rmslabel = deshaw.labelDEShaw_rmsd()

        deshaw_samples = {b:[] for b in binlist}
        for i, b in enumerate(rmslabel):
          deshaw_samples[b].append(i)

        coord_origin = []

        conv_vals = np.array([v for k, v in sorted(convergence_rms.items())])
        norm_pdf_conv = conv_vals / sum(conv_vals)
        logging.info("Umbrella Samping PDF (Bootstrapping):")
        sampled_distro_perbin = {b: 0 for b in binlist}

        while numresources > 0:
          # First sampling is BIASED
          selected_bin = np.random.choice(len(binlist), p=norm_pdf_conv)
          A, B = binlist[selected_bin]
          sampled_distro_perbin[binlist[selected_bin]] += 1
          if bincounts[selected_bin] is not None and bincounts[selected_bin] > 0:
            # Secondary Sampling is Uniform
            sample_num = np.random.randint(bincounts[selected_bin])
            logging.debug('SAMPLER: selecting sample #%d from bin %s', 
              sample_num, str(binlist[selected_bin]))
            index = self.catalog.lindex('varbin:rms:%d_%d' % binlist[selected_bin], 
              sample_num)
            selected_index_list.append(index)
            coord_origin.append(('sim', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          elif len(deshaw_samples[binlist[selected_bin]]) > 0:
            index = np.random.choice(deshaw_samples[binlist[selected_bin]])
            logging.debug('SAMPLER: selecting DEShaw frame #%d from bin %s', 
              index, str(binlist[selected_bin]))
            # Negation indicates an historical index number
            selected_index_list.append(-index)
            coord_origin.append(('deshaw', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          else:
            logging.info("NO Candidates for bin: %s", binlist[selected_bin])



      ###### EXPERIMENT #10:  MultiVariate Nearest Neighbor (MVNN) SAMPLER
      if EXPERIMENT_NUMBER == 10:

        # Create the KD Tree from all feature landscapes (ignore first 5 features)
        kd = KDTree(100, 15, np.array(feallist), 'median')

        # Collect hypercubes
        hc = kd.getleaves()

        logging.info('KD Tree Stats')
        logging.info('    # HCubes   : %5d', len(hc))
        logging.info('    Largest  HC: %5d', max([v['count'] for k,v in hc.items()]))
        logging.info('    Smallest HC: %5d', min([v['count'] for k,v in hc.items()]))

        for key, hcube in hc.items():
          hc_feal = [feallist[i] for i in hcube['elm']]
          hc[key]['feal'] = np.mean(hc_feal, axis=0)

        #  Determine a scale and/or separate scales for each feature set
        desired = 10 - global_landscape
        logging.info('Desired Landscape:\n%s', feal.tostring(desired))

        #  Calc euclidean dist to each mean HC's feal
        nn = {k: LA.norm(desired[5:] - v['feal'][5:]) for k,v in hc.items()}

        #  Grab top N Neighbors (10 for now)
        neighbors = sorted(nn.items(), key=lambda x: x[1])[:10]
        logging.info('BestFit Landscape:\n%s', feal.tostring(hc[neighbors[0][0]]['feal']))

        ## DATA SAMPLER
        nn_keys = [i for i,w in neighbors]
        nn_wgts = np.array([w for i,w in neighbors])
        nn_wgts /= np.sum(nn_wgts)  # normalize

        coord_origin = []
        while numresources > 0:
          # First sampling is BIASED
          selected_hc = np.random.choice(nn_keys, p=nn_wgts)

          # Second is UNIFORM (within the HCube)
          index = np.random.choice(hc[selected_hc]['elm'])
          selected_index_list.append(index)
          src_state = np.argmax(feallist[index][:5])
          coord_origin.append(('sim', index, src_state, selected_hc))
          logging.info('Sampled Landscape [hc=%s]:\n%s', selected_hc, feal.tostring(feallist[index]))

          numresources -= 1


      elif EXPERIMENT_NUMBER == 11:

        #  Use only right most 10 features (non-normalized ones)
        inventory = np.array([f[10:] for f in feallist])
        desired = 10 - global_landscape
        logging.info('Desired Landscape (NOTE: Only Including A-B values):\n%s', feal.tostring(desired))

        selected_index_list = mvkp.knapsack(desired[10:], inventory, numresources, 2000000)
        coord_origin = [('sim', index, np.argmax(feallist[index][:5]), 'D') for index in selected_index_list]
        logging.info("KNAPSACK Completed:")
        logging.info('Target Distribution:\n%s', str(desired[10:]))
        logging.info('Target Distribution:\n%s', '\n'.join(['%s'%feallist[i] for i in selected_index_list]))

    # Back Project to get new starting Coords for each sample  
      logging.debug("=======================  <INPUT PARAM GENERATION>  =================")
      logging.info('All Indices sampled. Back projecting to high dim coords')
      sampled_set = []
      for i in selected_index_list:
        traj = self.backProjection([i])
        sampled_set.append(traj)
      bench.mark('Sampler')

    # Generate new starting positions
      runtime = self.data['runtime']
      jcqueue = OrderedDict()
      for i, start_traj in enumerate(sampled_set):
        jcID, params = generateNewJC(start_traj)

        # TODO:  Update/check adaptive runtime, starting state
        jcConfig = dict(params,
              name    = jcID,
              runtime = runtime,     # In timesteps
              dcdfreq = self.data['dcdfreq'],           # Frame save rate
              interval = self.data['dcdfreq'] * self.data['sim_step_size'],                       
              temp    = 310,
              timestep = self.data['timestep'],
              gc      = 1,
              origin  = coord_origin[i][0],
              src_index = coord_origin[i][1],
              src_bin  = coord_origin[i][2],
              src_hcube = coord_origin[i][3],
              application   = settings.APPL_LABEL)

        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in jcConfig.items():
          logging.debug("   %s:  %s", k, str(v))

        #  Add to the output queue & save config info
        jcqueue[jcID] = jcConfig
        logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

      bench.mark('GenInputParams')

    #  POST-PROCESSING  -------------------------------------
      logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
          
      self.wait_catalog()

      # Clear current queue, mark previously queued jobs for GC, push new queue
      qlen = self.catalog.llen('jcqueue')
      logging.debug('Current queue len:   %s', str(qlen))
      if qlen > 0:
        curqueue = self.catalog.lrange('jcqueue', 0, -1)
        logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
        for jc_key in curqueue:
          key = wrapKey('jc', jc_key)
          config = self.catalog.hgetall(key)
          config['gc'] = 0
          # Add gc jobs it to the state to write back to catalog (flags it for gc)
          self.addMut(key, config)
        self.catalog.delete('jcqueue')

      #  CATALOG UPDATES
      self.catalog.rpush('datacount', len(feallist))

      #  EXPR 7 Update:
      if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
        # self.catalog.storeNPArray(np.array(centroid), 'subspace:covar:centroid:%d' % cov_iteration)
        self.catalog.rpush('subspace:covar:thruindex', len(covar_pts))

      # Update cache hit/miss
      hit = self.cache_hit
      miss = self.cache_miss
      logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, (hit)/(hit+miss))
      self.catalog.rpush('cache:hit', self.cache_hit)
      self.catalog.rpush('cache:miss', self.cache_miss)

      self.data['jcqueue'] = list(jcqueue.keys())

      logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
      # Update Each new job with latest convergence score and save to catalog(TODO: save may not be nec'y)
      logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
      for jcid, config in jcqueue.items():
        # config['converge'] = self.data['converge']
        self.addMut(wrapKey('jc', jcid), config)
 
      bench.mark('PostProcessing')
      print ('## TS=%d' % self.data['timestep'])
      bench.show()
      stat.show()

      return list(jcqueue.keys())
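
The umbrella sampler in experiment #5 follows a two-stage pattern: a biased draw over bins (np.random.choice with a probability vector) followed by a uniform draw within the chosen bin. A self-contained sketch with made-up bins, weights, and candidate lists:

import numpy as np

bins = [(0, 0), (0, 1), (1, 1), (2, 2)]                 # hypothetical (A, B) bins
weights = np.array([0.1, 0.4, 0.3, 0.2])                # e.g. normalized convergence values
candidates = {b: list(range(10 * i, 10 * i + 5)) for i, b in enumerate(bins)}

rng = np.random.default_rng(0)
picks = []
for _ in range(8):                                      # numresources
  sel = rng.choice(len(bins), p=weights)                # stage 1: biased over bins
  chosen = bins[sel]
  if candidates[chosen]:
    picks.append(int(rng.choice(candidates[chosen])))   # stage 2: uniform within the bin
print(picks)
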
Example No. 12
class ExprAnl:
  def __init__(self, host='localhost', port=6379, adaptive_cent=False):
    self.r = redis.StrictRedis(port=port, host=host, decode_responses=True)
    # self.rms = [np.fromstring(i) for i in self.r.lrange('subspace:rms', 0, -1)]
    self.seq = ['jc_'+x[0] for x in sorted(self.r.hgetall('anl_sequence').items(), key=lambda x: int(x[1]))]
    self.conf=[self.r.hgetall(i) for i in self.seq]
    self.trlist = {}
    self.wells = [[] for i in range(5)]
    self.rmsd_c = {}
    self.rmsd_cw = {}
    self.rmsd_ds = {}
    self.rmsd_dsw = {}
    self.feal_list = None
    self.checked_rmsd = []
    self.cent_ds = np.load('../bpti-alpha-dist-centroid.npy')
    # self.cent_c = np.load('../data/gen-alpha-cartesian-centroid.npy')
    self.cent_c = np.load('../data/init-centroids.npy')
    self.cw = [.92, .92, .96, .99, .99]  

    if adaptive_cent:
      pass

  def loadtraj(self, tr, first=None):
    if isinstance(tr, list):
      trlist = tr
    else:
      trlist = [tr]
    for t in trlist:
      traj = md.load(self.conf[t]['dcd'], top=self.conf[t]['pdb'])
      # traj.center_coordinates()
      if first is not None:
        traj = traj.slice(np.arange(first))
      alpha = datareduce.filter_alpha(traj)
      # alpha.superpose(deshaw.topo_alpha)
      self.trlist[t] = alpha

  def ld_wells(self):
    for x, i in enumerate(self.conf):
      if i['origin'] == 'deshaw':
        A, B = eval(i['src_bin'])
        if A == B:
          traj = md.load(self.conf[A]['dcd'], top=self.conf[A]['pdb'])
          traj.center_coordinates()
          alpha = dr.filter_alpha(traj)
          maxf = min(1000, alpha.n_frames)
          for i in alpha.xyz[:maxf]:
            self.wells[A].append(i)

  def load(self, num, first=None):
    for i in range(num):
      _ = self.rms(i, first=first)   # keyword needed: the 2nd positional arg of rms() is 'noise'

  def rms(self, trnum, noise=False, force=False, first=None):
    if trnum not in self.trlist.keys():
      self.loadtraj(trnum, first)
    if trnum not in self.rmsd_c.keys() or force:
      if noise:
        # With Noise Filter
        noise = int(self.r.get('obs_noise'))
        dcdfreq = int(self.r.get('dcdfreq'))
        stepsize = int(self.r.get('sim_step_size'))
        nwidth = noise//(2*stepsize)
        noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)
        source_pts = np.array([noisefilt(self.trlist[trnum].xyz, i) for i in range(self.trlist[trnum].n_frames)])
      else:
        source_pts = self.trlist[trnum].xyz
      # self.rmsd_c[trnum] = [[LA.norm(self.cent_c[i]-pt) for i in range(5)] for pt in source_pts.xyz]
      self.rmsd_cw[trnum] = [[self.cw[i]*LA.norm(self.cent_c[i]-pt) for i in range(5)] for pt in source_pts]
      # ds = dr.distance_space(source_pts)
      # self.rmsd_ds[trnum] = [[LA.norm(self.cent_ds[i]-pt) for i in range(5)] for pt in ds]
      # self.rmsd_dsw[trnum] = [[self.cw[i]*LA.norm(self.cent_ds[i]-pt) for i in range(5)] for pt in ds]
    return self.rmsd_cw[trnum]

  def feal(self, trnum, winsize=None, var=False):
    feal_list = []
    var_list = []
    if trnum not in self.rmsd.keys():
      _ = self.rms(trnum)
    N = len(self.rmsd[trnum])
    wsize = N if winsize is None else winsize
    for idx in range(0, N, wsize):
      window = self.rmsd[trnum][idx:min(N,idx+wsize)]
      if var:
        f, v = self.feature_temporal(window, var=True)
        var_list.append(v)
      else:
        f = self.feature_temporal(window)
      feal_list.append(f)
    if var:
      return np.array(feal_list), np.array(var_list)
    return np.array(feal_list)

  @classmethod
  def feal_atemp(cls, rms, scaleto=10):
    """Atemporal (individual frame) featue landscape
    """
    log_reld = op.makeLogisticFunc(scaleto, -3, 0)

    # Counts  (feature 0..4)
    fealand = [0 for i in range(5)]
    fealand[np.argmin(rms)] = scaleto
    tup = []

    # Proximity (feature 5..9)
    # Normalized and adjusted to smooth implicit water
    A, B = np.argsort(rms)[:2]
    prox = op.makeLogisticFunc(scaleto, scaleto, (rms[B]+rms[A])/2)
    for d in rms:
      tup.append(prox(d))

    # Relative Distance (akin to LLE) (feature 10..19)
    for a in range(4):
      for b in range(a+1, 5):
        rel_dist = rms[a]-rms[b]
        tup.append(log_reld(rel_dist))

    fealand.extend(tup)

    # Additional Feature Spaces Would go here
    return np.array(fealand)   # Tuple or NDArray?

  def feature_temporal(self, window, var=False, scaleto=10):
    """ FEATURE LANDSCAPE Calculation for traj f data pts
    """
    landscape = [self.feal_atemp(rms) for rms in window]
    meanfeal = np.mean(landscape, axis=0)
    if var:
      variance = [0 for i in range(5)]
      variance.extend(np.std([tup[5:] for tup in landscape], axis=0))
      return np.array(meanfeal), np.array(variance)
    else:
      return np.array(meanfeal)

  def all_feal(self, force=False, method='cw'):
    # if method == 'c':
    #   rmsd = self.rmsd_c
    # elif method == 'cw':
    #   rmsd = self.rmsd_cw
    # elif method == 'ds':
    #   rmsd = self.rmsd_ds
    # else:
    #   rmsd = self.rmsd_dsw
    rmsd = self.rmsd_cw
    if self.feal_list is None or force:
      self.feal_list = op.flatten([[self.feal_atemp(i) for i in rmsd[tr]] for tr in rmsd.keys()])    
    return self.feal_list

  def feal_global(self):
    flist = self.all_feal()
    return np.mean(flist, axis=0)


  def bootstrap(self, size):
    feal = self.all_feal()
    i = 0
    boot = []
    while i+size < len(feal):
      print(i)
      boot.append(op.bootstrap_block(feal[:i+size], size))
      i += size
    return boot

  def draw_feal(self, trnum=None, norm=10):
    if trnum is None:
      flist = self.all_feal()
      agg = np.mean(flist, axis=0)
      P.feadist(agg, 'feal_global_%s' % self.r.get('name'), norm=norm)
    else:
      flist = [self.feal_atemp(i, scaleto=norm) for i in self.rmsd[trnum]]
      agg = np.mean(flist, axis=0)
      P.feadist(agg, 'feal_global_%s_%d' % (self.r.get('name'), trnum), norm=norm)

  def kdtree(self, leafsize, depth, method):
    self.index = []
    allpts = []
    # Recalc for indexing
    flist = [[self.feal_atemp(i) for i in self.rmsd[tr]] for tr in self.rmsd.keys()]
    for trnum, f in enumerate(flist):
      for i, tup in enumerate(f):
        allpts.append(tup[5:])
        self.index.append((trnum, i))
    self.kd = KDTree(leafsize, depth, np.array(allpts), method)
    self.hc = self.kd.getleaves()

  def hcmean(self, hckey=None):
    if hckey is None:
      flist = [[self.feal_atemp(i) for i in self.rmsd[tr]] for tr in self.rmsd.keys()]      
      result = {}
      for k, v in self.hc.items():
        hc_feal = []
        for idx in v['elm']:
          trnum, frame = self.index[int(idx)]
          hc_feal.append(flist[trnum][frame])
        result[k] = np.mean(hc_feal, axis=0)
      return result
    else:
      flist = []
      for idx in self.hc[hckey]['elm']:
        trnum, frame = self.index[idx]
        flist.append(self.feal_list[trnum][frame])
      return np.mean(flist, axis=0)
   
  ####  FEATURE LANDSCAPE Temporal Data Analysis/Study
  def make_covar(self, win, slide):
    covar = []
    WIN_SIZE_NS = win
    SLIDE_AMT_NS = slide
    for i, tr in self.trlist.items():
      if i % 100==0:
        print(i)
      cov = datareduce.calc_covar(tr.xyz, WIN_SIZE_NS, 1, slide=SLIDE_AMT_NS)
      covar.extend(cov)
    return covar
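
The feal() method above reduces per-frame feature landscapes to one mean landscape per non-overlapping window. A plain-NumPy sketch of just that windowing step, with random numbers standing in for the per-frame features:

import numpy as np

frames = np.random.default_rng(1).random((23, 20))   # 23 frames x 20 features (stand-in)
wsize = 10
windows = [frames[i:i + wsize] for i in range(0, len(frames), wsize)]
per_window_mean = np.array([w.mean(axis=0) for w in windows])
print(per_window_mean.shape)                         # (3, 20); the last window holds only 3 frames
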
Example No. 13
    pd = {s: 0 for s in slist}
    for o in obs:
      pd[o] += 1
    for s, v in pd.items():
      hc_pdf[hc][s].append(v)

mn04 = {hc: {s: np.mean(i) for s, i in x.items()} for hc, x in hc_pdf.items()}
er04 = {hc: {s: np.std(i) for s, i in x.items()} for hc, x in hc_pdf.items()}

mn04_n = {e: {k: v/np.sum(list(l.values())) for k,v in l.items()} for e, l in mn04.items()}
er04_n = {e: {k: v/np.sum(list(l.values())) for k,v in l.items()}  for e, l in er04.items()}


  print('KDTree2')
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()
  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  dd = np.zeros(shape=(len(s1), len(s2)))
  print('     ', ' '.join(hc1k))
  for i, a in enumerate(s1):
    print('  ' +hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d'%n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
Example No. 14
File: anl.py  Project: DaMSL/ddc
    def execute(self, jobkey):
      bench = microbench()
      config = self.data[jobkey]
      jobname = unwrapKey(jobkey)

      # RETRIEVE from Cache   --> Local Mem:
      allux = AlluxioClient()

      trajfile = config['name'] + '.dcd'
      # >>>>Storing DCD into shared memory on this node
      ramdisk = '/dev/shm/out/'
      if not os.path.exists(ramdisk):
        os.mkdir(ramdisk)
      logging.info("Created ramdisk at: %s", ramdisk)
      bench.mark('mk_ramdisk')
      allux.cp(trajfile, ramdisk)
      bench.mark('cpFromAllux')
      logging.info("Copied to shared mem: %s", os.listdir(ramdisk))
      dcd_ramfile = os.path.join(ramdisk, trajfile)


    # 1. Check if source files exist
      # logging.debug("1. Check for file")
      # if not (os.path.exists(config['dcd']) and os.path.exists(config['pdb'])):
      #   logging.error('Source Files not found: %s, %s', config['dcd'], config['pdb'])
      #   return []

    # 2. Load raw data from trajectory file
      logging.debug("2. Load DCD")
      # bench.start()
      # traj = datareduce.filter_heavy(config['dcd'], config['pdb'])
      # load from Local Mem Now
      traj = datareduce.filter_heavy(dcd_ramfile, config['pdb'])
      bench.mark('File_Load')
      logging.debug('Trajectory Loaded: %s (%s)', config['name'], str(traj))

    # 3. Update Catalog with HD points (TODO: cache this)
      #  TODO: Pipeline all
      # off-by-1: append list returns size (not inserted index)
      #  ADD index to catalog
      # Off by 1 error for index values
      file_idx = self.catalog.append({'xid:filelist': [config['dcd']]})[0]
      delta_xid_index = [(file_idx-1, x) for x in range(traj.n_frames)]
      global_idx = self.catalog.append({'xid:reference': delta_xid_index})
      global_xid_index_slice = [x-1 for x in global_idx]
      bench.mark('Indx_Update')


    # 4. Update higher dimensional index
      # Logical Sequence # should be unique seq # derived from manager (provides this
      #  worker's instantiation with a unique ID for indexing)
      mylogical_seqnum = str(self.seqNumFromID())
      self.catalog.hset('anl_sequence', config['name'], mylogical_seqnum)

      # INSERT NEW points here into cache/archive
      logging.debug(" Loading new conformations into cache....TODO: NEED CACHE LOC")
      # for i in range(traj.n_frames):
      #   cache.insert(global_xid_index_slice[i], traj.xyz[i])

    # 5a. Subspace Calculation: RMS
    #------ A:  RMSD  ------------------
      #     S_A = rmslist

      # 1. Calc RMS for each conform to all centroids
      logging.debug("3. RMS Calculation")
      numLabels = len(self.data['centroid'])
      numConf = len(traj.xyz)
      rmsraw = np.zeros(shape=(numConf, numLabels))
      for i, conform in enumerate(traj.xyz):
        np.copyto(rmsraw[i], np.array([LA.norm(conform-cent) for cent in self.data['centroid']]))
      logging.debug('  RMS:  %d points projected to %d centroid-distances', numConf, numLabels)

      # 2. Account for noise
      #    For now: noise is user-configured; TODO: Factor in to Kalman Filter
      noise = DEFAULT.OBS_NOISE
      stepsize = 500 if 'interval' not in config else int(config['interval'])
      nwidth = noise//(2*stepsize)
      noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)

      # Notes: Delta_S == rmslist
      rmslist = np.array([noisefilt(rmsraw, i) for i in range(numConf)])

      # 3. Append new points into the data store. 
      #    r_idx is the returned list of indices for each new RMS point
      #  TODO: DECIDE on retaining a Reservoir Sample
      #    for each bin OR to just cache all points (per strata)
      #  Reservoir Sampling is Stratified by subspaceHash
      # logging.debug('Creating reservoir Sample')
      # reservoirSampling(self.catalog, traj.xyz, rIdx, subspaceHash, 
      #     lambda x: tuple([x]+list(traj.xyz.shape[1:])), 
      #     'rms', lambda key: '%d_%d' % key)
      r_idx = []
      pipe = self.catalog.pipeline()
      for si in rmslist:
        pipe.rpush('subspace:rms', bytes(si))
      idxlist = pipe.execute()
      for i in idxlist:
        r_idx.append(int(i) - 1)

      logging.debug("R_Index Created (rms).")

      # 4. Apply Heuristics Labeling
      logging.debug('Applying Labeling Heuristic')
      rmslabel = []
      subspaceHash = {}
      for i, rms in enumerate(rmslist):
        #  Sort RMSD by proximity & set state A as nearest state's centroid
        prox = np.argsort(rms)
        A = prox[0]

        #  Calc absolute proximity between the nearest 2 states' centroids.
        #  THETA is derived from a static run: it is based on the average std dev of all
        #   RMSDs from a static run of BPTI without solvent. It could be dynamically
        #   calculated, but is hard-coded here. Theta is divided by four based on
        #   analysis of the DEShaw data (roughly 3% of DEShaw frames are in transition).
        avg_stddev = 0.34119404492089034
        theta = avg_stddev / 4.

        # NOTE: The original formulation was relative. Retained here for reference:
        # Rel vs Abs: Calc relative proximity for top 2 nearest centroids   
        # relproximity = rms[A] / (rms[A] + rms[rs[1]])
        # B = rs[1] if relproximity > (.5 - theta) else A
        # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  #relative
        proximity = abs(rms[prox[1]] - rms[A])    #abs

        #  (TODO:  Factor in more than top 2, better noise)
        #  Label secondary sub-state
        B = prox[1] if proximity < theta else A
        rmslabel.append((A, B))
        if (A, B) not in subspaceHash:
          subspaceHash[(A, B)] = []
          logging.debug("Found Label: %s", str((A, B)))
        subspaceHash[(A, B)].append(i)

      # Update Catalog
      idxcheck = self.catalog.append({'label:rms': rmslabel})
      bench.mark('RMS')


    # 5b. Subspace Calculation: PCA
    #------ B:  PCA  -----------------
      # 1. Project Pt to PC's for each conform (top 3 PC's)
      logging.debug("Using following PCA Vectors: %s", str(self.data['pcaVectors'].shape))
      pcalist = datareduce.PCA(traj.xyz, self.data['pcaVectors'], numpc=3)

      # 2. Append subspace in catalog
      p_idx = []
      pipe = self.catalog.pipeline()
      for si in pcalist:
        pipe.rpush('subspace:pca', bytes(si))
      idxlist = pipe.execute()
      for i in idxlist:
        p_idx.append(int(i) - 1)
      logging.debug("P_Index Created (pca) for delta_S_pca")

      # 3. Perform tiling over the subspace
      #   For Now: Load entire tree into local memory
      hcube_mapping = json.loads(self.catalog.get('hcube:pca'))
      logging.debug('# Loaded keys = %d', len(hcube_mapping.keys()))

      # 4. Pull entire Subspace (for now)
      #   Note: this is more efficient than inserting new points
      #   due to underlying Redis insertions / index look-ups.
      #   If this becomes a bottleneck, we may need a custom redis client
      #   connection that persists the connection and keeps the socket open (on both ends),
      #   or handle some of this I/O server-side via a Lua script
      packed_subspace = self.catalog.lrange('subspace:pca', 0, -1)
      subspace_pca = np.array([np.fromstring(x) for x in packed_subspace])

      # TODO: accessor function is for 1 point (i) and 1 axis (j). 
      #  Optimize by changing to pipeline  retrieval for all points given 
      #  a list of indices with an axis (if nec'y)
      logging.debug("Reconstructing the tree...")
      hcube_tree = KDTree.reconstruct(hcube_mapping, subspace_pca)

      # logging.debug("Inserting Delta_S_pca into KDtree (hcubes)")
      # for i in range(len(pcalist)):
      #   hcube_tree.insert(pcalist[i], p_idx[i])

      # TODO: Ensure hcube_tree is written to catalog
      # TODO: DECIDE on retaining a Reservoir Sample
      # reservoirSampling(self.catalog, traj.xyz, r_idx, subspaceHash, 
      #     lambda x: tuple([x]+list(traj.xyz.shape[1:])), 
      #     'pca', 
      #     lambda key: '%d_%d' % key)
      bench.mark('PCA')

      bench.show()
      return [config['name']]
Example No. 15
    def execute(self):
        """Special execute function for the reweight operator -- check/validate.
      """
        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        self.cacheclient = CacheClient(self.name)
        numLabels = 5
        binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
        labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
        num_pts = len(labeled_pts_rms)
        logging.debug('##NUM_OBS: %d', num_pts)

        # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
        TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4),
                     (0, 2), (0, 1)]
        MAX_SAMPLE_SIZE = 100  # Max # of cov traj to back project per HCube
        MAX_PT_PER_MATRIX = 100  # Num points to sample from each cov traj
        COVAR_SIZE = 200  # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
        MAX_HCUBE = 6  # Max Num HCubes to process

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamma Function

        logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
        logging.info("Retrieving All Covariance Vectors")
        home = os.getenv('HOME')
        cfile = home + '/work/DEBUG_COVAR_PTS'
        DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
        if DO_COVAR:
            if os.path.exists(cfile + '.npy'):
                covar_pts = np.load(cfile + '.npy')
                logging.debug('Loaded From File')
            else:
                covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
                covar_pts = np.array([np.fromstring(x) for x in covar_raw])
                np.save(cfile, covar_pts)
                logging.debug('Loaded From Catalog & Saved')
        covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
        logging.debug('Indices Loaded. Retrieving File Indices')
        covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

        if DO_COVAR:
            logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
            logging.info(
                "Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)"
            )

            # For incremental PCA:
            NUM_PC = 6
            ipca_key = 'subspace:covar:ipca'
            ipca = PCAnalyzer.load(self.catalog, ipca_key)
            if ipca is None:
                logging.info('Creating a NEW IPCA')
                ipca = PCAIncremental(NUM_PC)
                lastindex = 0
            else:
                lastindex = ipca.trainsize
                logging.info(
                    'IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                    ipca.trainsize,
                    len(covar_pts) - ipca.trainsize)

            # For incremental, partial solve using only newer pts (from the last "trainsize")
            if len(covar_pts) - lastindex > 0:
                ipca.solve(covar_pts[lastindex:])
                logging.info("Incrementatl PCA Updated. Storing Now...")

                ####  BARRIER
                self.wait_catalog()
                ipca.store(self.catalog, ipca_key)

            logging.info("IPCA Saved. Projecting Covariance to PC")

        cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
        if os.path.exists(cfile + '.npy'):
            subspace_covar_pts = np.load(cfile + '.npy')
        else:
            subspace_covar_pts = ipca.project(covar_pts)
            np.save(cfile, subspace_covar_pts)

        # OTHERWISE: PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
        logging.info(
            'Building Global KD Tree over Covar Subspace with %d data pts',
            len(subspace_covar_pts))
        global_kdtree = KDTree(250,
                               maxdepth=8,
                               data=subspace_covar_pts,
                               method='middle')

        if MAX_HCUBE <= 0:
            hcube_global = global_kdtree.getleaves()
        else:
            # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
            hcube_global_ALL = global_kdtree.getleaves()
            hcube_global = {}
            num = 0
            for k, v in hcube_global_ALL.items():
                hcube_global[k] = v
                num += 1
                if num == MAX_HCUBE:
                    break

        # hcube_global = global_kdtree.getleaves()
        logging.info(
            'Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)'
        )
        for k in sorted(hcube_global.keys()):
            v = hcube_global[k]
            logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'],
                         v['volume'], v['density'])

        if self.filelog:
            keys = hcube_global.keys()
            self.filelog.info('global,keys,%s', ','.join(keys))
            self.filelog.info(
                'global,count,%s',
                ','.join([str(hcube_global[k]['count']) for k in keys]))
            self.filelog.info(
                'global,volume,%s',
                ','.join([str(hcube_global[k]['volume']) for k in keys]))
            self.filelog.info(
                'global,density,%s',
                ','.join([str(hcube_global[k]['density']) for k in keys]))

        logging.info(
            "=====  SELECT Sampling of points from each Global HCube  (B)")
        s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
        hcube_global = {x[0]: x[1] for x in s}

        counter = 0
        for key in hcube_global.keys():
            counter += 1
            if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
                cov_index = hcube_global[key]['elm']
                hcube_global[key]['samplefactor'] = 1
            else:
                cov_index = np.random.choice(hcube_global[key]['elm'],
                                             MAX_SAMPLE_SIZE)
                hcube_global[key]['samplefactor'] = len(
                    hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
            hcube_global[key]['idxlist'] = []
            for cov in cov_index:
                selected_hd_idx = np.random.choice(COVAR_SIZE,
                                                   MAX_PT_PER_MATRIX).tolist()
                hcube_global[key]['idxlist'].extend(
                    [int(covar_index[cov]) + i for i in selected_hd_idx])
            logging.info('Back Projecting Global HCube `%s`  (%d out of %d)',
                         key, counter, len(hcube_global.keys()))
            source_cov = self.backProjection(hcube_global[key]['idxlist'])
            hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
            logging.debug('Back Projected %d points to HD space: %s',
                          len(hcube_global[key]['idxlist']),
                          str(hcube_global[key]['alpha']))

        # logging.info('Calculating all HD Distances')
        # dist_hd = {}
        # dist_ld = {}
        # for key in hcube_global.keys():
        #   T = hcube_global[key]['alpha'].xyz
        #   N = len(T)
        #   dist_hd[key] = np.zeros(shape=(N, N))
        #   dist_ld[key] = {}
        #   for A in range(0, N):
        #     dist_hd[key][A][A] = 0
        #     for B in range(A+1, N):
        #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
        reservoir = ReservoirSample('rms', self.catalog)

        logging.info(
            "=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) "
        )
        hcube_list = {}

        logging.info(
            "Scanning current set of observed bins and finding all smallest with data (excluding largest 2)"
        )
        hcube_local = {}

        logging.info("=======================================================")
        logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
        logging.info(
            "=======================================================\n")

        overlap_hcube = {k: {} for k in hcube_global.keys()}

        projection_map = {}

        pt_projection_list = []
        for key in sorted(hcube_global.keys()):
            for i in range(len(hcube_global[key]['alpha'].xyz)):
                pt_projection_list.append([])
        for bin_idx, tbin in enumerate(TEST_TBIN):
            logging.info("Project Global HCubes into local subspace for %s",
                         str(tbin))
            # Load Vectors
            logging.info('Loading subspace and kernel for bin %s', str(tbin))

            # LOAD KPCA Kernel matrix
            kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
            kpca = PCAnalyzer.load(self.catalog, kpca_key)

            data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
            data = np.array([np.fromstring(x) for x in data_raw])
            if len(data) == 0:
                logging.error(
                    'No Raw PCA data points for bin %s.... Going to next bin',
                    str(tbin))
                continue

            logging.info(
                'Building KDtree over local %s bin from observations matrix of size: %s',
                str(tbin), str(data.shape))
            kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
            hcube_local[tbin] = kdtree.getleaves()
            logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
            for k in sorted(hcube_local[tbin].keys()):
                logging.info('    `%-9s`   #pts:%6d   density:%9.1f', k,
                             len(hcube_local[tbin][k]['elm']),
                             hcube_local[tbin][k]['density'])

            if self.filelog:
                keys = hcube_local[tbin].keys()
                A, B = tbin
                self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
                self.filelog.info(
                    'local,%d_%d,count,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['count']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,volume,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['volume']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,density,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['density']) for k in keys]))

            n_total = 0
            logging.debug('Global Hcubes to Project (%d):  %s',
                          len(hcube_global.keys()), str(hcube_global.keys()))
            projection_map[bin_idx] = {
                k: set()
                for k in hcube_local[tbin].keys()
            }

            pnum = 0
            for key in sorted(hcube_global.keys()):
                overlap_hcube[key][tbin] = {}
                cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

                logging.debug(
                    'PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
                    key, len(cov_proj_pca), str(tbin))
                for i, pt in enumerate(cov_proj_pca):
                    hcube = kdtree.probe(pt, probedepth=9)
                    # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
                    if hcube not in overlap_hcube[key][tbin]:
                        overlap_hcube[key][tbin][hcube] = {
                            'idxlist': hcube_local[tbin][hcube]['elm'],
                            'wgt': hcube_local[tbin][hcube]['density'],
                            'num_projected': 0
                        }
                    overlap_hcube[key][tbin][hcube]['num_projected'] += 1

                    # Index this point in corresponding local HCube projection view
                    projection_map[bin_idx][hcube].add(pnum)

                    pt_projection_list[pnum].append(hcube)
                    pnum += 1

                for k, v in sorted(overlap_hcube[key][tbin].items()):
                    logging.debug(
                        '   Project ==> Local HCube `%-9s`: %5d points', k,
                        v['num_projected'])
                # logging.info('Calculating Lower Dimensional Distances')
                # N = len(cov_proj_pca)
                # dist_ld[key][tbin] = np.zeros(shape=(N, N))
                # for A in range(0, N):
                #   for B in range(A+1, N):
                #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

    # Re-Index projected points -- could make this a list too
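    # Each view handed to collapse_join is (set-of-bin-indices, [(set-of-hcube-keys,
    # set-of-point-indices), ...]); the join is expected to merge views whose HCubes
    # share projected points across subspaces.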

        next_index = 0
        view_list = []
        for bin_idx, hcube_map in projection_map.items():
            hcube_list = []
            for hcube_key, pt_list in hcube_map.items():
                hcube_list.append((set((hcube_key, )), set(pt_list)))
            view_list.append((set((bin_idx, )), hcube_list))

        print("CALLING: Collapse Join")
        joined_subspaces = collapse_join(projection_map.keys(), view_list)
        for subspace_list, correlated_hcubes in joined_subspaces:
            tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
            for hcube_list, pt_list in correlated_hcubes:
                print(tbin_list, hcube_list, pt_list)
                # TODO: Correlate Back to Global
        print('Visualize HERE')

        # for idx, tbin in enumerate(TEST_TBIN):
        #   # Only process substates with data
        #   if tbin not in hcube_local:
        #     logging.warning('Local KD Tree not created for %s', str(tbin))
        #     continue
        #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # sets = {}
        # proj_bin_list = []
        # for tbin in TEST_TBIN:
        #   if tbin not in hcube_local:
        #     continue
        #   proj_bin_list.append(tbin)
        #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # set_list = {}
        # for tbin, view in sets.items():
        #   set_list[(tbin,)] = []
        #   for hcube, idxlist in view.items():
        #     print(tbin, hcube, idxlist)
        #     set_list[(tbin,)].append((set((hcube,)), idxlist))

        # def collapse(C):
        #   a = 0
        #   b = 0
        #   N = []
        #   while a < len(C) and b < len(C):
        #     A = sorted(C[a])
        #     B = sorted(C[b])
        #     if A == B:
        #       b += 1
        #     elif A[0] == B[0]:
        #       N.append(set(A)|set(B))
        #       b += 1
        #     else:
        #       a += 1
        #   if len(N) <= 1:
        #     return []
        #   else:
        #     return N + collapse(N)

        # q=collapse(t1)
        # for i in q: print(sorted(i))

        # print('Checking all 2-Way Joins')
        # join2 = {}
        # for a in range(0, len(proj_bin_list)-1):
        #   tA = proj_bin_list[a]
        #   for b in range(a+1, len(proj_bin_list)):
        #     tB = proj_bin_list[b]
        #     join_ss = tuple(set((tA, tB)))
        #     set_list = []
        #     for kA, vA in sets[tA].items():
        #       for kB, vB in sets[tB].items():
        #         join_hc = set((kA, kB))
        #         inter = vA & vB
        #         if len(inter) > 0:
        #           set_list.append((join_hc, inter))
        #     if len(set_list) > 0:
        #       join2[join_ss] = set_list
        # print('2-Way Join Results:')
        # for ss, set_list in join2.items():
        #   for hc, idxlist in set_list:
        #     print(ss, hc, idxlist)

        # print('Checking all 3-Way Joins')
        # join3 = []
        # checked = []
        # for a in range(0, len(join2)-1):
        #   sA, hA, vA = join2[a]
        #   for b in range(a+1, len(join2)):
        #     sB, hB, vB = join2[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join3.append((ss, hc, inter))

        # print('Checking all 4-Way Joins')
        # join4 = []
        # checked = []
        # for a in range(0, len(join3)-1):
        #   sA, hA, vA = join3[a]
        #   for b in range(a+1, len(join3)):
        #     sB, hB, vB = join3[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join4.append((ss, hc, inter))

        # if self.filelog:
        #   for i in join2:
        #     self.filelog.info('%s', str(i))
        #   for i in join3:
        #     self.filelog.info('%s', str(i))
        #   for i in join4:
        #     self.filelog.info('%s', str(i))

        DO_MIN_CHECK = False
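        # NOTE: this check depends on dist_hd / dist_ld, which are only built by the
        # commented-out distance calculations above; re-enable those blocks before
        # setting DO_MIN_CHECK = True.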
        if DO_MIN_CHECK:

            def maxcount(x):
                y = {}
                for i in x:
                    y[i] = 1 if i not in y else y[i] + 1
                return max(y.values())

            print(
                '%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces)'
            )
            argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf)
                                                  for i in x])
            for key in hcube_global.keys():
                # logging.info('Showing MIN / MAX for points from HCube %s:', key)
                minA = {}
                maxA = {}
                for n in range(len(dist_hd[key])):
                    minA[n] = []
                    maxA[n] = []
                    for tbin in TEST_TBIN:
                        if tbin not in dist_ld[key].keys():
                            continue
                        else:
                            minA[n].append(
                                argmin_nonzero(dist_ld[key][tbin][n]))
                            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
                numsame = np.zeros(len(dist_ld[key].keys()) + 1)
                for n in range(len(dist_hd[key])):
                    minH = argmin_nonzero(dist_hd[key][n])
                    maxH = np.argmax(dist_hd[key][n])
                    minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
                    numsamepair = maxcount(minA[n])
                    numsame[numsamepair] += 1
                    # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
                print(' '.join([
                    '%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))
                ]))

        print('Stopping HERE!')
        sys.exit(0)
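        # Early exit for debugging: the bipartite-graph construction below is
        # currently unreachable and is kept for reference.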
        #  GAMMA FUNCTION EXPR # 8
        gamma1 = lambda a, b: (a * b)
        gamma2 = lambda a, b: (a + b) / 2
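        # gamma1/gamma2 are candidate edge-weight combiners for the bipartite graph:
        # the product vs. the arithmetic mean of the local (A) and global (B) weights.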

        # TODO: Factor in RMS weight
        for tbin in TEST_TBIN:
            # for tbin in sorted(bin_list):
            logging.info('')
            logging.info('BIPARTITE GRAPH for %s', str(tbin))
            bipart = {}
            edgelist = []
            for hcB in hcube_global.keys():
                num_B = hcube_global[hcB]['count']
                wgt1_B = hcube_global[hcB]['density']
                if tbin not in overlap_hcube[hcB]:
                    continue
                for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
                    edge = {}
                    if hcA not in bipart:
                        bipart[hcA] = []
                    num_proj = hcA_data['num_projected']
                    wgt_A = hcA_data['wgt']
                    wgt2_B = wgt1_B * num_proj
                    edge['combW1'] = gamma1(wgt_A, wgt1_B)
                    edge['combW2'] = gamma1(wgt_A, wgt2_B)
                    edge['combW3'] = gamma2(wgt_A, wgt1_B)
                    edge['combW4'] = gamma2(wgt_A, wgt2_B)
                    edge['num_A'] = len(hcA_data['idxlist'])
                    edge['num_B'] = num_B
                    edge['num_proj'] = num_proj
                    edge['wgt_A'] = wgt_A
                    edge['wgt1_B'] = wgt1_B
                    edge['wgt2_B'] = wgt2_B
                    edge['hcA'] = hcA
                    edge['hcB'] = hcB
                    bipart[hcA].append(edge)
                    edgelist.append((hcA, hcB, num_proj))
            if len(bipart) == 0:
                logging.info("NO DATA FOR %s", str(tbin))
                continue
            logging.info('')
            logging.info(
                'A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)'
            )
            for k, v in bipart.items():
                for edge in v:
                    logging.info(
                        'A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f'
                        % edge)
                    if self.filelog:
                        A, B = tbin
                        self.filelog.info('edge,%d_%d,%s,%s,%d', A, B,
                                          edge['hcA'], edge['hcB'],
                                          edge['num_proj'])

            # Prepare nodes for graph
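            # Keep only edges with more than 5 projected points, then remap HCube
            # keys to integer node indices before handing the graph off for plotting.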
            nA = set()
            nB = set()
            elist = []
            for e in edgelist:
                a, b, z = e
                if z <= 5:
                    continue
                nA.add(a)
                nB.add(b)
                elist.append((a, b, z))
            nAKeys = sorted(nA)[::-1]
            nBKeys = sorted(nB)[::-1]
            sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
            sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
            idxA = {key: i for i, key in enumerate(nAKeys)}
            idxB = {key: i for i, key in enumerate(nBKeys)}
            edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
            G.bipartite(sizesA, sizesB, edges, sizesA, sizesB,
                        'bipartite_%d_%d' % tbin)

        logging.info('STOPPING HERE!!!!')
        sys.exit(0)
        return []
Exemplo n.º 16
0
    def execute(self):
      """Special execute function for the reweight operator -- check/validate.
      """
    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      self.cacheclient = CacheClient(self.name)
      numLabels = 5
      binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
      labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
      num_pts = len(labeled_pts_rms)
      logging.debug('##NUM_OBS: %d', num_pts)

      # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
      TEST_TBIN = [(2,0), (4,2), (2,2), (4,1), (3,1), (4,4), (0,4), (0,2), (0,1)]
      MAX_SAMPLE_SIZE   =  100   # Max # of cov traj to back project per HCube
      MAX_PT_PER_MATRIX =  100   # Num points to sample from each cov traj
      COVAR_SIZE        = 200   # Each cov "pt" spans 200 HD pts -- should be set based on the user query
      MAX_HCUBE         = 6      # Max Num HCubes to process


    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamma Function

      logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
      logging.info("Retrieving All Covariance Vectors")
      home = os.getenv('HOME')
      cfile = home + '/work/DEBUG_COVAR_PTS'
      DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
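      # NOTE: when DO_COVAR is False, covar_pts and ipca are not (re)built here;
      # the cached DEBUG_SUBCOVAR_PTS projection file below is assumed to exist.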
      if DO_COVAR: 
        if os.path.exists(cfile + '.npy'):
          covar_pts = np.load(cfile + '.npy')
          logging.debug('Loaded From File')
        else: 
          covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
          covar_pts = np.array([np.fromstring(x) for x in covar_raw])
          np.save(cfile, covar_pts)
          logging.debug('Loaded From Catalog & Saved')
      covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
      logging.debug('Indices Loaded. Retrieving File Indices')
      covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

      if DO_COVAR: 
        logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
        logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

        # For incremental PCA:
        NUM_PC = 6
        ipca_key = 'subspace:covar:ipca'
        ipca = PCAnalyzer.load(self.catalog, ipca_key)
        if ipca is None:
          logging.info('Creating a NEW IPCA')
          ipca = PCAIncremental(NUM_PC)
          lastindex = 0
        else:
          lastindex = ipca.trainsize
          logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts', 
            ipca.trainsize, len(covar_pts)-ipca.trainsize)

        # For incremental, partial solve using only newer pts (from the last "trainsize")
        if len(covar_pts)-lastindex > 0:
          ipca.solve(covar_pts[lastindex:])
          logging.info("Incrementatl PCA Updated. Storing Now...")

          ####  BARRIER 
          self.wait_catalog()
          ipca.store(self.catalog, ipca_key)

        logging.info("IPCA Saved. Projecting Covariance to PC")

      cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
      if os.path.exists(cfile + '.npy'):
        subspace_covar_pts = np.load(cfile + '.npy')
      else: 
        subspace_covar_pts = ipca.project(covar_pts)
        np.save(cfile, subspace_covar_pts)

      # Otherwise, project new pts only -- but retain grouped index of all points
      logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
      global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')
  

      if MAX_HCUBE <= 0:
        hcube_global = global_kdtree.getleaves()
      else:
      # FOR DEBUGGING -- USE ONLY THE FIRST MAX_HCUBE GLOBAL HCUBES
        hcube_global_ALL = global_kdtree.getleaves()
        hcube_global = {}
        num = 0
        for k, v in hcube_global_ALL.items():
          hcube_global[k] = v
          num += 1
          if num == MAX_HCUBE:
            break

      # hcube_global = global_kdtree.getleaves()
      logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE: DEBUGGING -- ONLY MAX_HCUBE USED)')
      for k in sorted(hcube_global.keys()):
        v = hcube_global[k]
        logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

      if self.filelog:
        keys = hcube_global.keys()
        self.filelog.info('global,keys,%s',','.join(keys))
        self.filelog.info('global,count,%s',','.join([str(hcube_global[k]['count']) for k in keys]))
        self.filelog.info('global,volume,%s',','.join([str(hcube_global[k]['volume']) for k in keys]))
        self.filelog.info('global,density,%s',','.join([str(hcube_global[k]['density']) for k in keys]))

      logging.info("=====  SELECT Sampling of points from each Global HCube  (B)")
      s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
      hcube_global = {x[0]: x[1] for x in s}


      counter = 0
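      # Cap each HCube's back-projection at MAX_SAMPLE_SIZE covariance trajectories;
      # 'samplefactor' records how heavily the larger HCubes were sub-sampled.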
      for key in hcube_global.keys():
        counter += 1
        if hcube_global[key]['count']  <= MAX_SAMPLE_SIZE:
          cov_index = hcube_global[key]['elm']
          hcube_global[key]['samplefactor'] = 1
        else:
          cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
          hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
        hcube_global[key]['idxlist'] = []
        for cov in cov_index:
          selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
          hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
        logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
        source_cov = self.backProjection(hcube_global[key]['idxlist'])
        hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
        logging.debug('Back Projected %d points to HD space: %s', 
          len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))

      # logging.info('Calculating all HD Distances')
      # dist_hd = {}
      # dist_ld = {}
      # for key in hcube_global.keys():
      #   T = hcube_global[key]['alpha'].xyz
      #   N = len(T)
      #   dist_hd[key] = np.zeros(shape=(N, N))
      #   dist_ld[key] = {}
      #   for A in range(0, N):
      #     dist_hd[key][A][A] = 0
      #     for B in range(A+1, N):
      #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])
        

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
      reservoir = ReservoirSample('rms', self.catalog)

      logging.info("=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) ")
      hcube_list = {}

      logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
      hcube_local = {}

      logging.info("=======================================================")
      logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
      logging.info("=======================================================\n")

      overlap_hcube = {k: {} for k in hcube_global.keys()}

      projection_map = {}


      pt_projection_list = []
      for key in sorted(hcube_global.keys()):
        for i in range(len(hcube_global[key]['alpha'].xyz)):
          pt_projection_list.append([])
      for bin_idx, tbin in enumerate(TEST_TBIN):
        logging.info("Project Global HCubes into local subspace for %s", str(tbin))
        # Load Vectors
        logging.info('Loading subspace and kernel for bin %s', str(tbin))

        # LOAD KPCA Kernel matrix
        kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
        kpca = PCAnalyzer.load(self.catalog, kpca_key)

        data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
        data = np.array([np.fromstring(x) for x in data_raw])
        if len(data) == 0:
          logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
          continue


        logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
        kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
        hcube_local[tbin] = kdtree.getleaves()
        logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
        for k in sorted(hcube_local[tbin].keys()):
          logging.info('    `%-9s`   #pts:%6d   density:%9.1f', 
            k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

        if self.filelog:
          keys = hcube_local[tbin].keys()
          A,B = tbin
          self.filelog.info('local,%d_%d,keys,%s',A,B,','.join(keys))
          self.filelog.info('local,%d_%d,count,%s',A,B,','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
          self.filelog.info('local,%d_%d,volume,%s',A,B,','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
          self.filelog.info('local,%d_%d,density,%s',A,B,','.join([str(hcube_local[tbin][k]['density']) for k in keys]))          

        n_total = 0
        logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
        projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}
        
        pnum = 0
        for key in sorted(hcube_global.keys()):
          overlap_hcube[key][tbin] = {}
          cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

          logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ', 
            key, len(cov_proj_pca), str(tbin))
          for i, pt in enumerate(cov_proj_pca):
            hcube = kdtree.probe(pt, probedepth=9)
            # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
            if hcube not in overlap_hcube[key][tbin]:
              overlap_hcube[key][tbin][hcube] = {
                  'idxlist': hcube_local[tbin][hcube]['elm'],
                  'wgt': hcube_local[tbin][hcube]['density'], 
                  'num_projected': 0}
            overlap_hcube[key][tbin][hcube]['num_projected'] += 1

            # Index this point in corresponding local HCube projection view
            projection_map[bin_idx][hcube].add(pnum)

            pt_projection_list[pnum].append(hcube)
            pnum += 1

          for k, v in sorted(overlap_hcube[key][tbin].items()):
            logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])
          # logging.info('Calculating Lower Dimensional Distances')
          # N = len(cov_proj_pca)
          # dist_ld[key][tbin] = np.zeros(shape=(N, N))
          # for A in range(0, N):
          #   for B in range(A+1, N):
          #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])


    # Re-Index projected points -- could make this a list too
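    # Each view handed to collapse_join is (set-of-bin-indices, [(set-of-hcube-keys,
    # set-of-point-indices), ...]), pairing local HCubes with the projected points they contain.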

      next_index = 0
      view_list = []
      for bin_idx, hcube_map in projection_map.items():
        hcube_list = []
        for hcube_key, pt_list in hcube_map.items():
          hcube_list.append((set((hcube_key,)), set(pt_list)))
        view_list.append((set((bin_idx,)), hcube_list))

      print("CALLING: Collapse Join")
      joined_subspaces = collapse_join(projection_map.keys(), view_list)
      for subspace_list, correlated_hcubes in joined_subspaces:
        tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
        for hcube_list, pt_list in correlated_hcubes:
          print(tbin_list, hcube_list, pt_list)
          # TODO: Correlate Back to Global
      print('Visualize HERE')



      # for idx, tbin in enumerate(TEST_TBIN):
      #   # Only process substates with data
      #   if tbin not in hcube_local:
      #     logging.warning('Local KD Tree not created for %s', str(tbin))
      #     continue
      #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))


      # sets = {}
      # proj_bin_list = []
      # for tbin in TEST_TBIN:
      #   if tbin not in hcube_local:
      #     continue
      #   proj_bin_list.append(tbin)
      #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))

      # set_list = {}
      # for tbin, view in sets.items():
      #   set_list[(tbin,)] = []
      #   for hcube, idxlist in view.items():
      #     print(tbin, hcube, idxlist)
      #     set_list[(tbin,)].append((set((hcube,)), idxlist))


      # def collapse(C):
      #   a = 0
      #   b = 0
      #   N = []
      #   while a < len(C) and b < len(C):
      #     A = sorted(C[a])
      #     B = sorted(C[b])
      #     if A == B:
      #       b += 1
      #     elif A[0] == B[0]:
      #       N.append(set(A)|set(B))
      #       b += 1
      #     else:
      #       a += 1
      #   if len(N) <= 1:
      #     return []
      #   else:
      #     return N + collapse(N)

      # q=collapse(t1)
      # for i in q: print(sorted(i))


      # print('Checking all 2-Way Joins')
      # join2 = {}
      # for a in range(0, len(proj_bin_list)-1):
      #   tA = proj_bin_list[a]
      #   for b in range(a+1, len(proj_bin_list)):
      #     tB = proj_bin_list[b]
      #     join_ss = tuple(set((tA, tB)))
      #     set_list = []
      #     for kA, vA in sets[tA].items():
      #       for kB, vB in sets[tB].items():
      #         join_hc = set((kA, kB))
      #         inter = vA & vB
      #         if len(inter) > 0:
      #           set_list.append((join_hc, inter))
      #     if len(set_list) > 0:
      #       join2[join_ss] = set_list
      # print('2-Way Join Results:')
      # for ss, set_list in join2.items():
      #   for hc, idxlist in set_list:
      #     print(ss, hc, idxlist)


      # print('Checking all 3-Way Joins')
      # join3 = []
      # checked = []
      # for a in range(0, len(join2)-1):
      #   sA, hA, vA = join2[a]
      #   for b in range(a+1, len(join2)):
      #     sB, hB, vB = join2[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join3.append((ss, hc, inter))


      # print('Checking all 4-Way Joins')
      # join4 = []
      # checked = []
      # for a in range(0, len(join3)-1):
      #   sA, hA, vA = join3[a]
      #   for b in range(a+1, len(join3)):
      #     sB, hB, vB = join3[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join4.append((ss, hc, inter))

      # if self.filelog:
      #   for i in join2:
      #     self.filelog.info('%s', str(i))
      #   for i in join3:
      #     self.filelog.info('%s', str(i))
      #   for i in join4:
      #     self.filelog.info('%s', str(i))

      DO_MIN_CHECK = False
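      # NOTE: relies on dist_hd / dist_ld from the commented-out distance blocks above;
      # re-enable those before setting DO_MIN_CHECK = True.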
      if DO_MIN_CHECK:
        def maxcount(x):
          y={}
          for i in x:
            y[i] = 1 if i not in y else y[i]+1
          return max(y.values())

        print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces)')
        argmin_nonzero = lambda x: np.argmin([(i if i>0 else np.inf) for i in x])
        for key in hcube_global.keys():
          # logging.info('Showing MIN / MAX for points from HCube %s:', key)
          minA = {}; maxA={}
          for n in range(len(dist_hd[key])) :
            minA[n]=[] ; maxA[n]=[]
            for tbin in TEST_TBIN:
              if tbin not in dist_ld[key].keys():
                continue
              else:
                minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
                maxA[n].append(np.argmax(dist_ld[key][tbin][n]))          
          numsame = np.zeros(len(dist_ld[key].keys())+1)
          for n in range(len(dist_hd[key])):
            minH = argmin_nonzero(dist_hd[key][n])
            maxH = np.argmax(dist_hd[key][n])
            minmax = ['%2d/%-2d'%i for i in zip(minA[n], maxA[n])]
            numsamepair = maxcount(minA[n])
            numsame[numsamepair] += 1
            # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
          print(' '.join(['%4.1f%%'%i for i in (100* (numsame/np.sum(numsame)))]))

      print('Stopping HERE!')
      sys.exit(0)
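      # Early exit for debugging: the bipartite-graph construction below is unreachable.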
      #  GAMMA FUNCTION EXPR # 8
      gamma1 = lambda a, b : (a * b)
      gamma2 = lambda a, b : (a + b) / 2

      # TODO: Factor in RMS weight
      for tbin in TEST_TBIN:
      # for tbin in sorted(bin_list):
        logging.info('')
        logging.info('BIPARTITE GRAPH for %s', str(tbin))
        bipart = {}
        edgelist = []
        for hcB in hcube_global.keys():
          num_B  = hcube_global[hcB]['count']
          wgt1_B = hcube_global[hcB]['density']
          if tbin not in overlap_hcube[hcB]:
            continue
          for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
            edge = {}
            if hcA not in bipart:
              bipart[hcA] = []  
            num_proj  = hcA_data['num_projected']
            wgt_A  = hcA_data['wgt']
            wgt2_B = wgt1_B*num_proj
            edge['combW1'] = gamma1(wgt_A, wgt1_B)
            edge['combW2'] = gamma1(wgt_A, wgt2_B)
            edge['combW3'] = gamma2(wgt_A, wgt1_B)
            edge['combW4'] = gamma2(wgt_A, wgt2_B)
            edge['num_A']  = len(hcA_data['idxlist'])
            edge['num_B']  = num_B
            edge['num_proj']  = num_proj
            edge['wgt_A']  = wgt_A
            edge['wgt1_B'] = wgt1_B
            edge['wgt2_B'] = wgt2_B
            edge['hcA'] = hcA
            edge['hcB'] = hcB
            bipart[hcA].append(edge)
            edgelist.append((hcA, hcB, num_proj))
        if len(bipart) == 0:
          logging.info("NO DATA FOR %s", str(tbin))
          continue
        logging.info('')
        logging.info('A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)')
        for k, v in bipart.items():
          for edge in v:
            logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
            if self.filelog:
              A,B = tbin
              self.filelog.info('edge,%d_%d,%s,%s,%d',A,B,edge['hcA'],edge['hcB'],edge['num_proj'])

        # Prepare nodes for graph
        nA = set()
        nB = set()
        elist = []
        for e in edgelist:
          a, b, z = e
          if z <= 5:
            continue
          nA.add(a)
          nB.add(b)
          elist.append((a,b,z))
        nAKeys = sorted(nA)[::-1]
        nBKeys = sorted(nB)[::-1]
        sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
        sizesB = [hcube_global[n]['count']*3 for n in nBKeys]
        idxA = {key: i for i, key in enumerate(nAKeys)}
        idxB = {key: i for i, key in enumerate(nBKeys)}
        edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
        G.bipartite(sizesA,sizesB,edges,sizesA,sizesB,'bipartite_%d_%d' % tbin)

      logging.info('STOPPING HERE!!!!')
      sys.exit(0)
      return []