# NOTE: assumed imports for these extracted analysis routines (the original
# module header is not included in this snippet). Project-local modules
# referenced below (datareduce, deshaw, dh, db, settings, P, G, KDTree,
# PCAKernel, PCAnalyzer, PCAIncremental, CacheClient, ReservoirSample,
# collapse_join, calc_rmsd) are assumed to be importable from the project.
import os
import sys
import pickle
import logging

import numpy as np
from numpy import linalg as LA
import mdtraj as md
import redis


def reproject_oldata():
  r1 = redis.StrictRedis(port=6390, decode_responses=True)
  cache = redis.StrictRedis(host='bigmem0006', port=6380, decode_responses=True)
  execlist = r1.hgetall('anl_sequence')
  keyorder = ['jc_' + i[0] for i in sorted(execlist.items(), key=lambda x: x[1])]
  # skip first 100 (non-sampled)
  pts = []
  bad_ref = 0
  miss = 0
  for key in keyorder:
    conf = r1.hgetall(key)
    src = int(conf['src_index'])
    ref = r1.lindex('xid:reference', src)
    if ref is not None:
      fileno, frame = eval(ref)
      ckey = 'sim:%s' % conf['name']
      xyz = cache.lindex(ckey, frame)
      if xyz is not None:
        pts.append(pickle.loads(xyz))
      else:
        tr = md.load_frame(conf['dcd'], frame, top=conf['pdb'])
        if len(tr.xyz) == 0:
          miss += 1
        else:
          pts.append(tr.xyz[0])
    else:
      bad_ref += 1
  traj = md.Trajectory(pts, deshaw.topo_prot.top)
  alpha = datareduce.filter_alpha(traj)
  return alpha
def high_low_check(r, tbin='(0, 4)'):
  print('Pulling data...')
  obslist = r.lrange('label:rms', 0, -1)
  ob04 = [i for i, o in enumerate(obslist) if o == tbin]
  traj = backProjection(r, ob04)
  alpha = datareduce.filter_alpha(traj)
  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()
  print('KDTree2')
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()
  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  # Overlap counts: rows are KPCA-space leaves (hc1), columns are raw-coordinate leaves (hc2)
  dd = np.zeros(shape=(len(s1), len(s2)))
  print('    ', ' '.join(hc2k))
  for i, a in enumerate(s1):
    print('  ' + hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d' % n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
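# Optional helper (a sketch, not part of the original source): the raw overlap
# counts returned by high_low_check are easier to compare across leaves of
# different size when normalized, e.g. to a Jaccard similarity between the
# KPCA-space leaves and the raw-coordinate leaves.
def overlap_jaccard(hc1, hc2):
  hc1k, hc2k = sorted(hc1.keys()), sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  jac = np.zeros(shape=(len(s1), len(s2)))
  for i, a in enumerate(s1):
    for j, b in enumerate(s2):
      union = len(a | b)
      jac[i][j] = len(a & b) / union if union else 0.0
  return jac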
def kpca_check(red_db, tbin='(0, 4)'):
  if isinstance(red_db, list):
    rlist = red_db
  else:
    rlist = [red_db]
  trajlist = []
  for r in rlist:
    print('Pulling data...')
    obslist = r.lrange('label:rms', 0, -1)
    idxlist = [i for i, o in enumerate(obslist) if o == tbin]
    traj = dh.backProjection(r, idxlist)
    alpha = datareduce.filter_alpha(traj)
    trajlist.append(alpha)
  deidx = lambda i: deidx_cutlist(i, [t.n_frames for t in trajlist])
  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()
  srcidx = [[i[0] for i in db.runquery("select idx from jc where bin='0_4' and expid=%d" % e)]
            for e in range(32, 36)]
  src_traj = [dh.backProjection(r, i) for r, i in zip(rlist, srcidx)]
  src_xyz = [datareduce.filter_alpha(t).xyz for t in src_traj]
  probe_res = [[kdtree1.project(i.reshape(174,)) for i in xyz] for xyz in src_xyz]
  grp_src = []
  for p, s in zip(probe_res, srcidx):
    grp = {}
    for h, i in zip(p, s):
      if h not in grp:
        grp[h] = []
      grp[h].append(i)
    grp_src.append(grp)
  idx_se_map = [{i: (s, e)
                 for i, s, e in db.runquery("select idx, start, end from jc where bin='0_4' and expid=%d" % eid)}
                for eid in range(32, 36)]
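# `deidx_cutlist` is referenced above but not defined in this snippet. A
# plausible (hypothetical) implementation, inferred only from its call site
# deidx_cutlist(i, [t.n_frames for t in trajlist]): map a global frame index
# back to a (trajectory number, local frame) pair given per-trajectory counts.
def deidx_cutlist(idx, framecounts):
  for t, n in enumerate(framecounts):
    if idx < n:
      return (t, idx)
    idx -= n
  raise IndexError('index %d beyond concatenated frame list' % idx)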
def ld_wells(self):
  for x, i in enumerate(self.conf):
    if i['origin'] == 'deshaw':
      A, B = eval(i['src_bin'])
      if A == B:
        traj = md.load(self.conf[A]['dcd'], top=self.conf[A]['pdb'])
        traj.center_coordinates()
        alpha = dr.filter_alpha(traj)
        maxf = min(1000, alpha.n_frames)
        for frame in alpha.xyz[:maxf]:
          self.wells[A].append(frame)
def loadtraj(self, tr, first=None):
  if isinstance(tr, list):
    trlist = tr
  else:
    trlist = [tr]
  for t in trlist:
    traj = md.load(self.conf[t]['dcd'], top=self.conf[t]['pdb'])
    # traj.center_coordinates()
    if first is not None:
      traj = traj.slice(np.arange(first))
    alpha = datareduce.filter_alpha(traj)
    # alpha.superpose(deshaw.topo_alpha)
    self.trlist[t] = alpha
def backProjection(db, index_list):
  """Perform OFFLINE back projection for a list of indices using the given DB.
  Return a list of high dimensional points (one per index).
  Assumes NO CACHE or DESHAW.
  """
  logging.debug('--------  BACK PROJECTION:  %d POINTS ---', len(index_list))
  # Dereference indices to (file, frame) tuples:
  pipe = db.pipeline()
  for idx in index_list:
    pipe.lindex('xid:reference', int(idx))
  generated_framelist = pipe.execute()

  # Group all generated indices by file index
  groupbyFileIdx = {}
  for i, idx in enumerate(generated_framelist):
    try:
      file_index, frame = eval(idx)
    except TypeError as e:
      print('Bad Index:', str(idx))
      continue
    if file_index not in groupbyFileIdx:
      groupbyFileIdx[file_index] = []
    groupbyFileIdx[file_index].append(frame)

  # Dereference file index to filenames
  generated_frameMask = {}
  generated_filemap = {}
  for file_index in groupbyFileIdx.keys():
    filename = db.lindex('xid:filelist', file_index)
    if filename is None:
      logging.warning('Error file not found in catalog: %s', filename)
      continue
    if not os.path.exists(filename):
      logging.warning('DCD File not found: %s', filename)
    else:
      key = os.path.splitext(os.path.basename(filename))[0]
      generated_frameMask[key] = groupbyFileIdx[file_index]
      generated_filemap[key] = filename

  # Add high-dim points to the list of source points, one trajectory at a time
  logging.debug('Sequentially Loading all trajectories')
  source_points = []
  for key, framelist in generated_frameMask.items():
    traj = datareduce.load_trajectory(generated_filemap[key])
    traj = datareduce.filter_alpha(traj)
    selected_frames = traj.slice(framelist)
    source_points.extend(selected_frames.xyz)
  return np.array(source_points)
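# Minimal usage sketch (assumes `r` is the same Redis catalog used by the
# routines above and that 'label:rms' stores bin labels as strings such as
# '(0, 4)'): back-project every observation from one transition bin into
# alpha-carbon coordinates, as high_low_check does for its KD-tree comparison.
def example_backproject_bin(r, tbin='(0, 4)'):
  obslist = r.lrange('label:rms', 0, -1)
  idxlist = [i for i, o in enumerate(obslist) if o == tbin]
  return backProjection(r, idxlist)  # ndarray of shape (N, n_alpha, 3)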
def centroid_bootstrap(catalog):
  centfile = settings.RMSD_CENTROID_FILE
  centroid = np.load(centfile)
  cent_npts = [1, 1, 1, 1, 1]  # TBD
  numLabels = len(centroid)
  binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
  logging.info("Loaded Starting Centroids from %s", centfile)

  name = catalog.get('name')
  if name is None:
    logging.info('Name not configured in this catalog. Set it and try again')
    return

  # Load/Set initial (current) Configs from Catalog
  if catalog.exists('thetas'):
    thetas = catalog.loadNPArray('thetas')
  else:
    thetas = np.zeros(shape=(numLabels, numLabels))
    thetas[:] = 0.25

  if catalog.exists('transition_sensitivity'):
    trans_factor = catalog.loadNPArray('transition_sensitivity')
  else:
    trans_factor = 0.2

  use_gradient = True
  obs_count = {ab: 0 for ab in binlist}
  C_delta = []
  T_delta = []

  # Configure Noise Filter
  noise = int(catalog.get('obs_noise'))
  dcdfreq = int(catalog.get('dcdfreq'))
  stepsize = int(catalog.get('sim_step_size'))
  nwidth = noise // (2 * stepsize)
  noisefilt = lambda x, i: np.mean(x[max(0, i - nwidth):min(i + nwidth, len(x))], axis=0)

  # Get previously Labeled data (or label data IAW current settings)
  eid = db.get_expid(name)
  obslist = [i[0] for i in db.runquery('SELECT obs FROM obs WHERE expid=%d' % eid)]
  jobs = [i[0] for i in sorted(catalog.hgetall('anl_sequence').items(), key=lambda x: x[1])]
  shape = None

  # Initialize lists for pair-wise distances (top 2 nearest centroids)
  diffList = {}
  transList = {}
  scatPlot = {}
  for A in range(0, numLabels - 1):
    for B in range(A + 1, numLabels):
      diffList[(A, B)] = []
      transList[(A, B)] = []
      scatPlot[(A, B)] = []
  allScat = []

  # Load trajectories & filter
  obs_global = []

  # Process learning in batches (static batch size to start)
  batch_size = 25
  max_obs = 150
  batch = 0
  while batch <= max_obs:
    logging.info("Processing Jobs %d - %d", batch, batch + batch_size)
    exec_sim = []
    obs_list = []
    for job in jobs[batch:batch + batch_size]:
      conf = catalog.hgetall('jc_' + job)
      traj = md.load(conf['dcd'], top=conf['pdb'])
      alpha = datareduce.filter_alpha(traj)
      conf['alpha'] = alpha.xyz
      exec_sim.append(conf)
      if shape is None:
        shape = conf['alpha'].shape[1:]

      # xyz_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(alpha.n_frames)])
      rmslist = calc_rmsd(alpha, centroid)
      labels = []
      for rms in rmslist:
        # [cw[i]*LA.norm(pt - centroid[i]) for i in range(5)]
        A, B = np.argsort(rms)[:2]
        delta = np.abs(rms[B] - rms[A])
        if delta < thetas[A][B]:
          sub_state = B
        else:
          sub_state = A
        classify = (A, sub_state)
        labels.append(classify)
        obs_count[classify] += 1

        # For globally updating Thetas
        obs_global.append(classify)
        if A < B:
          diffList[(A, B)].append(rms[A] - rms[B])
        else:
          diffList[(B, A)].append(rms[B] - rms[A])

        for a in range(0, numLabels - 1):
          for b in range(a + 1, numLabels):
            transList[(a, b)].append(rms[a] - rms[b])
            if (a, a) == classify or (b, b) == classify:
              c = 'b'
            elif (a, b) == classify or (b, a) == classify:
              c = 'g'
            elif a == A or b == A:
              c = 'r'
            else:
              c = 'black'
            scatPlot[(a, b)].append((rms[a] - rms[b], c))
      obs_list.append(labels)

    logging.info('Bin Distribution:')
    grpby = {}
    for llist in obs_list:
      for l in llist:
        if l not in grpby:
          grpby[l] = 0
        grpby[l] += 1
    for k in sorted(grpby.keys()):
      logging.info('%s:  %5d', k, grpby[k])
    for A in range(0, numLabels - 1):
      for B in range(A + 1, numLabels):
        d = diffList[(A, B)]
        logging.info('Diff list for %d,%d:  %d, %5.2f, %5.2f', A, B, len(d), min(d), max(d))

    # # 6. Apply Heuristics Labeling
    # # logging.debug('Applying Labeling Heuristic. Origin: %d, %d', srcA, srcB)
    # rmslabel = []
    # # label_count = {ab: 0 for ab in binlist}
    # groupbystate = [[] for i in range(numLabels)]
    # groupbybin = {ab: [] for ab in binlist}

    # For each frame in each traj: ID labeled well pts & build avg op
    logging.info('Selecting observed Well States')
    coor_sum = {i: np.zeros(shape=shape) for i in range(numLabels)}
    coor_tot = {i: 0 for i in range(numLabels)}
    for job, obslist in zip(exec_sim, obs_list):
      # offset = int(job['xid:start'])
      # for i, frame in enumerate(job['alpha']):
      for frame, label in zip(job['alpha'], obslist):
        # A, B = eval(obslist[offset+i])
        A, B = label
        if A != B:
          continue
        coor_sum[A] += frame
        coor_tot[A] += 1

    logging.info('Calculating Avg from following stats:')
    logging.info('  Total Frames: %d', sum([len(sim['alpha']) for sim in exec_sim]))

    # Calculate New Centroids (w/deltas)
    delta = []
    for S in range(numLabels):
      if coor_tot[S] == 0:
        logging.info("  State: %d --- NO OBSERVATIONS IN THIS WELL STATE", S)
        continue
      cent_local = coor_sum[S] / coor_tot[S]
      diff_local = LA.norm(centroid[S] - cent_local)
      update = ((centroid[S] * cent_npts[S]) + (cent_local * coor_tot[S])) / (cent_npts[S] + coor_tot[S])
      delta.append(LA.norm(update - centroid[S]))
      logging.info('  State %d:  NewPts=%5d  Delta=%5.2f  LocalDiff=%5.2f',
                   S, coor_tot[S], delta[-1], diff_local)
      centroid[S] = update
      cent_npts[S] += coor_tot[S]
    centroid_change = np.mean(delta)
    if len(C_delta) > 1:
      rel_change = np.abs((centroid_change - C_delta[-1]) / C_delta[-1])
      logging.info('Centroid Change:  %5.2f  (%5.2f%%)', centroid_change, 100 * rel_change)
    C_delta.append(centroid_change)
    batch += batch_size

    # Update Thetas (using global data ?????)
    delta = []
    for A in range(0, numLabels - 1):
      for B in range(A + 1, numLabels):
        X = sorted(diffList[(A, B)])
        if len(X) < 100:
          logging.info('Lacking data on %d, %d', A, B)
          continue
        # logging.info('  Total # Obs: %d', len(X))
        crossover = 0
        for i, x in enumerate(X):
          if x > 0:
            crossover = i
            break
        # logging.info('  Crossover at Index: %d', crossover)
        if crossover < 50 or (len(X) - crossover) < 50:
          logging.info('  Lacking local data skipping.')
          continue

        # Find local max gradient (among 50% of points)
        if use_gradient:
          thetas_updated = np.copy(thetas)
          zoneA = int((1 - trans_factor) * crossover)
          zoneB = crossover + int(trans_factor * (len(X) - crossover))
          gradA = zoneA + np.argmax(np.gradient(X[zoneA:crossover]))
          gradB = crossover + np.argmax(np.gradient(X[crossover:zoneB]))
          thetaA = X[gradA]
          thetaB = X[gradB]
          thetas_updated[A][B] = np.abs(thetaA)
          thetas_updated[B][A] = np.abs(thetaB)
          tdeltA = np.abs(thetas_updated[A][B] - thetas[A][B])
          tdeltB = np.abs(thetas_updated[B][A] - thetas[B][A])
          delta.append(tdeltA)
          delta.append(tdeltB)
          logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', A, B, tdeltA, (100 * tdeltA / thetas[A][B]))
          logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', B, A, tdeltB, (100 * tdeltB / thetas[B][A]))
          thetas[A][B] = thetas_updated[A][B]
          thetas[B][A] = thetas_updated[B][A]
        else:
          # Classify Fixed Percent of observations as Transitional
          thetas_updated = np.copy(thetas)
          transitionPtA = int((1 - trans_factor) * crossover)
          transitionPtB = crossover + int(trans_factor * (len(X) - crossover))
          thetaA = X[transitionPtA]
          thetaB = X[transitionPtB]
          thetas_updated[A][B] = np.abs(thetaA)
          thetas_updated[B][A] = np.abs(thetaB)
          tdeltA = np.abs(thetas_updated[A][B] - thetas[A][B])
          tdeltB = np.abs(thetas_updated[B][A] - thetas[B][A])
          delta.append(tdeltA)
          delta.append(tdeltB)
          logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', A, B, tdeltA, (100 * tdeltA / thetas[A][B]))
          logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', B, A, tdeltB, (100 * tdeltB / thetas[B][A]))
          thetas[A][B] = thetas_updated[A][B]
          thetas[B][A] = thetas_updated[B][A]
    T_delta.append(np.mean(delta))

  P.line(np.array(C_delta), 'Avg_CHANGE_Centroid_Pos_%s' % name)
  P.line(np.array(T_delta), 'Avg_CHANGE_Theta_Val_%s' % name)
  P.bargraph_simple(obs_count, 'Final_Histogram_%s' % name)

  # for k, X in diffList.items():
  #   A, B = k
  #   P.transition_line(sorted(X), A, B, title='-X', trans_factor=.5)
  # for k, X in transList.items():
  #   A, B = k
  #   P.transition_line(sorted(X), A, B, title='-ALL', trans_factor=.5)

  for k, X in scatPlot.items():
    collab = {'b': 'Well', 'g': 'Trans', 'r': 'Primary', 'brown': 'Secondary', 'black': 'None'}
    ptmap = {k: [] for k in collab.keys()}
    ordpts = sorted(X, key=lambda x: x[0])
    for i, tup in enumerate(ordpts):
      y, c = tup
      ptmap[c].append((i, y))
      # if c == 'b' or c == 'g':
      #   ptmap[c].append((i, y))
      # else:
      #   ptmap[c].append((i, 0))
    A, B = k
    P.scat_Transtions(ptmap, title='-%d_%d' % (A, B), size=1, labels=collab)
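# The labeling rule used inside centroid_bootstrap, factored out as a small
# sketch for reference (not called by the code above): given a frame's RMSD to
# each of the 5 centroids and the per-pair sensitivity matrix `thetas`, label
# the frame (A, A) if it sits clearly in well A, else (A, B) transitional.
def label_frame(rms, thetas):
  A, B = np.argsort(rms)[:2]              # two nearest centroids
  delta = np.abs(rms[B] - rms[A])
  sub_state = B if delta < thetas[A][B] else A
  return (A, sub_state)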
def execute(self):
  """Special execute function for the reweight operator -- check/validate.
  """
  # PRE-PROCESSING ---------------------------------------------------------------------------------
  logging.debug("============================  <PRE-PROCESS>  =============================")
  self.cacheclient = CacheClient(self.name)
  numLabels = 5
  binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
  labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
  num_pts = len(labeled_pts_rms)
  logging.debug('##NUM_OBS: %d', num_pts)

  # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
  TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4), (0, 2), (0, 1)]
  MAX_SAMPLE_SIZE   = 100   # Max # of cov traj to back project per HCube
  MAX_PT_PER_MATRIX = 100   # Num points to sample from each cov traj
  COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
  MAX_HCUBE         = 6     # Max Num HCubes to process

  # IMPLEMENT USER QUERY with REWEIGHTING:
  logging.debug("=======================  <QUERY PROCESSING>  =========================")
  #  1. RUN KPCA on <<?????>> (sample set) and project all pts
  #  2. Calculate K-D Tree on above
  #  3. Score each point with distance to centroid
  #  4. B = Select the smallest half of clusters
  #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
  #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
  #       ALT-> use HCube size as its weight
  #  7. A = HCubes for states 3 (and 4)
  #  8. Reweight A into both state 3 and state 4 (B) HCubes
  #  9. ID Overlap
  # 10. Apply Gamma Function

  logging.info("===== Covariance Matrix PCA-KMeans Calculation (B)")
  logging.info("Retrieving All Covariance Vectors")
  home = os.getenv('HOME')
  cfile = home + '/work/DEBUG_COVAR_PTS'
  DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
  if DO_COVAR:
    if os.path.exists(cfile + '.npy'):
      covar_pts = np.load(cfile + '.npy')
      logging.debug('Loaded From File')
    else:
      covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
      covar_pts = np.array([np.fromstring(x) for x in covar_raw])
      np.save(cfile, covar_pts)
      logging.debug('Loaded From Catalog & Saved')
  covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
  logging.debug('Indices Loaded. Retrieving File Indices')
  covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

  if DO_COVAR:
    logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
    logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

    # FOR incremental PCA:
    NUM_PC = 6
    ipca_key = 'subspace:covar:ipca'
    ipca = PCAnalyzer.load(self.catalog, ipca_key)
    if ipca is None:
      logging.info('Creating a NEW IPCA')
      ipca = PCAIncremental(NUM_PC)
      lastindex = 0
    else:
      lastindex = ipca.trainsize
      logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                   ipca.trainsize, len(covar_pts) - ipca.trainsize)

    # For incremental, partial solve using only newer pts (from the last "trainsize")
    if len(covar_pts) - lastindex > 0:
      ipca.solve(covar_pts[lastindex:])
      logging.info("Incremental PCA Updated. Storing Now...")

      ####  BARRIER
      self.wait_catalog()
      ipca.store(self.catalog, ipca_key)
    logging.info("IPCA Saved. Projecting Covariance to PC")

  cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
  if os.path.exists(cfile + '.npy'):
    subspace_covar_pts = np.load(cfile + '.npy')
  else:
    subspace_covar_pts = ipca.project(covar_pts)
    np.save(cfile, subspace_covar_pts)

  # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
  logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
  global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')

  if MAX_HCUBE <= 0:
    hcube_global = global_kdtree.getleaves()
  else:
    # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
    hcube_global_ALL = global_kdtree.getleaves()
    hcube_global = {}
    num = 0
    for k, v in hcube_global_ALL.items():
      hcube_global[k] = v
      num += 1
      if num == MAX_HCUBE:
        break

  # hcube_global = global_kdtree.getleaves()
  logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
  for k in sorted(hcube_global.keys()):
    v = hcube_global[k]
    logging.info('%-10s  %6d  %8.1f  %6.1f', k, v['count'], v['volume'], v['density'])

  if self.filelog:
    keys = hcube_global.keys()
    self.filelog.info('global,keys,%s', ','.join(keys))
    self.filelog.info('global,count,%s', ','.join([str(hcube_global[k]['count']) for k in keys]))
    self.filelog.info('global,volume,%s', ','.join([str(hcube_global[k]['volume']) for k in keys]))
    self.filelog.info('global,density,%s', ','.join([str(hcube_global[k]['density']) for k in keys]))

  logging.info("===== SELECT Sampling of points from each Global HCube  (B)")
  s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
  hcube_global = {x[0]: x[1] for x in s}

  counter = 0
  for key in hcube_global.keys():
    counter += 1
    if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
      cov_index = hcube_global[key]['elm']
      hcube_global[key]['samplefactor'] = 1
    else:
      cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
      hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
    hcube_global[key]['idxlist'] = []
    for cov in cov_index:
      selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
      hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
    logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
    source_cov = self.backProjection(hcube_global[key]['idxlist'])
    hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
    logging.debug('Back Projected %d points to HD space: %s',
                  len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))

  # logging.info('Calculating all HD Distances')
  # dist_hd = {}
  # dist_ld = {}
  # for key in hcube_global.keys():
  #   T = hcube_global[key]['alpha'].xyz
  #   N = len(T)
  #   dist_hd[key] = np.zeros(shape=(N, N))
  #   dist_ld[key] = {}
  #   for A in range(0, N):
  #     dist_hd[key][A][A] = 0
  #     for B in range(A+1, N):
  #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

  # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
  reservoir = ReservoirSample('rms', self.catalog)

  logging.info("===== BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Observations) ")
  hcube_list = {}

  logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
  hcube_local = {}

  logging.info("=======================================================")
  logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
  logging.info("=======================================================\n")

  overlap_hcube = {k: {} for k in hcube_global.keys()}

  projection_map = {}
  pt_projection_list = []
  for key in sorted(hcube_global.keys()):
    for i in range(len(hcube_global[key]['alpha'].xyz)):
      pt_projection_list.append([])

  for bin_idx, tbin in enumerate(TEST_TBIN):
    logging.info("Project Global HCubes into local subspace for %s", str(tbin))
    # Load Vectors
    logging.info('Loading subspace and kernel for bin %s', str(tbin))

    # LOAD KPCA Kernel matrix
    kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
    kpca = PCAnalyzer.load(self.catalog, kpca_key)

    data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
    data = np.array([np.fromstring(x) for x in data_raw])
    if len(data) == 0:
      logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
      continue

    logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
    kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
    hcube_local[tbin] = kdtree.getleaves()
    logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
    for k in sorted(hcube_local[tbin].keys()):
      logging.info('    `%-9s`  #pts:%6d  density:%9.1f',
                   k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

    if self.filelog:
      keys = hcube_local[tbin].keys()
      A, B = tbin
      self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
      self.filelog.info('local,%d_%d,count,%s', A, B, ','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
      self.filelog.info('local,%d_%d,volume,%s', A, B, ','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
      self.filelog.info('local,%d_%d,density,%s', A, B, ','.join([str(hcube_local[tbin][k]['density']) for k in keys]))

    n_total = 0
    logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
    projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}

    pnum = 0
    for key in sorted(hcube_global.keys()):
      overlap_hcube[key][tbin] = {}
      cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

      logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s ',
                    key, len(cov_proj_pca), str(tbin))
      for i, pt in enumerate(cov_proj_pca):
        hcube = kdtree.probe(pt, probedepth=9)
        # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
        if hcube not in overlap_hcube[key][tbin]:
          overlap_hcube[key][tbin][hcube] = {
              'idxlist': hcube_local[tbin][hcube]['elm'],
              'wgt': hcube_local[tbin][hcube]['density'],
              'num_projected': 0}
        overlap_hcube[key][tbin][hcube]['num_projected'] += 1

        # Index this point in corresponding local HCube projection view
        projection_map[bin_idx][hcube].add(pnum)

        pt_projection_list[pnum].append(hcube)
        pnum += 1

      for k, v in sorted(overlap_hcube[key][tbin].items()):
        logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])

      # logging.info('Calculating Lower Dimensional Distances')
      # N = len(cov_proj_pca)
      # dist_ld[key][tbin] = np.zeros(shape=(N, N))
      # for A in range(0, N):
      #   for B in range(A+1, N):
      #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

  # Re-Index projected points -- could make this a list too
  next_index = 0
  view_list = []
  for bin_idx, hcube_map in projection_map.items():
    hcube_list = []
    for hcube_key, pt_list in hcube_map.items():
      hcube_list.append((set((hcube_key,)), set(pt_list)))
    view_list.append((set((bin_idx,)), hcube_list))

  print("CALLING: Collapse Join")
  joined_subspaces = collapse_join(projection_map.keys(), view_list)
  for subspace_list, correlated_hcubes in joined_subspaces:
    tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
    for hcube_list, pt_list in correlated_hcubes:
      print(tbin_list, hcube_list, pt_list)
      # TODO: Correlate Back to Global
  print('Visualize HERE')

  # for idx, tbin in enumerate(TEST_TBIN):
  #   # Only process substates with data
  #   if tbin not in hcube_local:
  #     logging.warning('Local KD Tree not created for %s', str(tbin))
  #     continue
  #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # sets = {}
  # proj_bin_list = []
  # for tbin in TEST_TBIN:
  #   if tbin not in hcube_local:
  #     continue
  #   proj_bin_list.append(tbin)
  #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # set_list = {}
  # for tbin, view in sets.items():
  #   set_list[(tbin,)] = []
  #   for hcube, idxlist in view.items():
  #     print(tbin, hcube, idxlist)
  #     set_list[(tbin,)].append((set((hcube,)), idxlist))

  # def collapse(C):
  #   a = 0
  #   b = 0
  #   N = []
  #   while a < len(C) and b < len(C):
  #     A = sorted(C[a])
  #     B = sorted(C[b])
  #     if A == B:
  #       b += 1
  #     elif A[0] == B[0]:
  #       N.append(set(A)|set(B))
  #       b += 1
  #     else:
  #       a += 1
  #   if len(N) <= 1:
  #     return []
  #   else:
  #     return N + collapse(N)
  # q=collapse(t1)
  # for i in q: print(sorted(i))

  # print('Checking all 2-Way Joins')
  # join2 = {}
  # for a in range(0, len(proj_bin_list)-1):
  #   tA = proj_bin_list[a]
  #   for b in range(a+1, len(proj_bin_list)):
  #     tB = proj_bin_list[b]
  #     join_ss = tuple(set((tA, tB)))
  #     set_list = []
  #     for kA, vA in sets[tA].items():
  #       for kB, vB in sets[tB].items():
  #         join_hc = set((kA, kB))
  #         inter = vA & vB
  #         if len(inter) > 0:
  #           set_list.append((join_hc, inter))
  #     if len(set_list) > 0:
  #       join2[join_ss] = set_list
  # print('2-Way Join Results:')
  # for ss, set_list in join2.items():
  #   for hc, idxlist in set_list:
  #     print(ss, hc, idxlist)

  # print('Checking all 3-Way Joins')
  # join3 = []
  # checked = []
  # for a in range(0, len(join2)-1):
  #   sA, hA, vA = join2[a]
  #   for b in range(a+1, len(join2)):
  #     sB, hB, vB = join2[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join3.append((ss, hc, inter))

  # print('Checking all 4-Way Joins')
  # join4 = []
  # checked = []
  # for a in range(0, len(join3)-1):
  #   sA, hA, vA = join3[a]
  #   for b in range(a+1, len(join3)):
  #     sB, hB, vB = join3[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join4.append((ss, hc, inter))

  # if self.filelog:
  #   for i in join2:
  #     self.filelog.info('%s', str(i))
  #   for i in join3:
  #     self.filelog.info('%s', str(i))
  #   for i in join4:
  #     self.filelog.info('%s', str(i))

  DO_MIN_CHECK = False
  if DO_MIN_CHECK:
    def maxcount(x):
      y = {}
      for i in x:
        y[i] = 1 if i not in y else y[i] + 1
      return max(y.values())

    print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces)')
    argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf) for i in x])
    for key in hcube_global.keys():
      # logging.info('Showing MIN / MAX for points from HCube %s:', key)
      minA = {}
      maxA = {}
      for n in range(len(dist_hd[key])):
        minA[n] = []
        maxA[n] = []
        for tbin in TEST_TBIN:
          if tbin not in dist_ld[key].keys():
            minA[n].append(0)
            maxA[n].append(0)
          else:
            minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
      numsame = np.zeros(len(dist_ld[key].keys()) + 1)
      for n in range(len(dist_hd[key][n])):
        minH = argmin_nonzero(dist_hd[key][n])
        maxH = np.argmax(dist_hd[key][n])
        minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
        numsamepair = maxcount(minA[n])
        numsame[numsamepair] += 1
        # print('%3d'%n, '%2d/%-2d '%(minH, maxH), '%s' % ' '.join(minmax), ' [%d]'%numsamepair)
      print(' '.join(['%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))]))
    print('Stopping HERE!')
    sys.exit(0)

  # GAMMA FUNCTION EXPR # 8
  gamma1 = lambda a, b: (a * b)
  gamma2 = lambda a, b: (a + b) / 2

  # TODO: Factor in RMS weight
  for tbin in TEST_TBIN:
    # for tbin in sorted(bin_list):
    logging.info('')
    logging.info('BIPARTITE GRAPH for %s', str(tbin))
    bipart = {}
    edgelist = []
    for hcB in hcube_global.keys():
      num_B = hcube_global[hcB]['count']
      wgt1_B = hcube_global[hcB]['density']
      if tbin not in overlap_hcube[hcB]:
        continue
      for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
        edge = {}
        if hcA not in bipart:
          bipart[hcA] = []
        num_proj = hcA_data['num_projected']
        wgt_A = hcA_data['wgt']
        wgt2_B = wgt1_B * num_proj
        edge['combW1'] = gamma1(wgt_A, wgt1_B)
        edge['combW2'] = gamma1(wgt_A, wgt2_B)
        edge['combW3'] = gamma2(wgt_A, wgt1_B)
        edge['combW4'] = gamma2(wgt_A, wgt2_B)
        edge['num_A'] = len(hcA_data['idxlist'])
        edge['num_B'] = num_B
        edge['num_proj'] = num_proj
        edge['wgt_A'] = wgt_A
        edge['wgt1_B'] = wgt1_B
        edge['wgt2_B'] = wgt2_B
        edge['hcA'] = hcA
        edge['hcB'] = hcB
        bipart[hcA].append(edge)
        edgelist.append((hcA, hcB, num_proj))
    if len(bipart) == 0:
      logging.info("NO DATA FOR %s", str(tbin))
      continue
    logging.info('')
    logging.info('A (# Pts) H-Cube  <---  B H-Cube (# proj/total Pts)  wgt_A  wB1:density  wB2:Mass  A*B1  A*B2  AVG(A,B1)  AVG(A,B2)')
    for k, v in bipart.items():
      for edge in v:
        logging.info('A (%(num_A)4d pts) `%(hcA)-8s`  <---  `%(hcB)9s` (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
        if self.filelog:
          A, B = tbin
          self.filelog.info('edge,%d_%d,%s,%s,%d', A, B, edge['hcA'], edge['hcB'], edge['num_proj'])

    # Prepare nodes for graph
    nA = set()
    nB = set()
    elist = []
    for e in edgelist:
      a, b, z = e
      if z <= 5:
        continue
      nA.add(a)
      nB.add(b)
      elist.append((a, b, z))
    nAKeys = sorted(nA)[::-1]
    nBKeys = sorted(nB)[::-1]
    sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
    sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
    idxA = {key: i for i, key in enumerate(nAKeys)}
    idxB = {key: i for i, key in enumerate(nBKeys)}
    edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
    G.bipartite(sizesA, sizesB, edges, sizesA, sizesB, 'bipartite_%d_%d' % tbin)

  logging.info('STOPPING HERE!!!!')
  sys.exit(0)
  return []
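# Sketch of the per-edge weight combinations used in the bipartite graph above
# (gamma1 = product, gamma2 = mean). `wgt_A` is the local HCube density,
# `wgt1_B` the global HCube density, and `wgt2_B` scales wgt1_B by the number
# of points projected along the edge; this helper is not part of the original source.
def edge_weights(wgt_A, wgt1_B, num_proj):
  wgt2_B = wgt1_B * num_proj
  return {'combW1': wgt_A * wgt1_B,        # gamma1(wgt_A, wgt1_B)
          'combW2': wgt_A * wgt2_B,        # gamma1(wgt_A, wgt2_B)
          'combW3': (wgt_A + wgt1_B) / 2,  # gamma2(wgt_A, wgt1_B)
          'combW4': (wgt_A + wgt2_B) / 2}  # gamma2(wgt_A, wgt2_B)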