예제 #1
0
def reproj_distro():
    local = get_local()
    data = get_edges()
    r2 = redis.StrictRedis(port=6385, decode_responses=True)
    for tbin in [(2, 4), (2, 2), (4, 4), (4, 2), (4, 1), (3, 1)]:
        print('Processing:', tbin)
        tkey = '%d_%d' % tbin
        # Get Kernel
        kpca_key = 'subspace:pca:kernel:' + tkey
        kpca = PCAnalyzer.load(r2, kpca_key)
        # Get Training Data
        data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
        pca_pts = np.array([np.fromstring(x) for x in data_raw])
        kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
        proj_pts = kpca.project(alpha.xyz)
        biased_hcubes = []
        for i, pt in enumerate(proj_pts):
            biased_hcubes.append(kdtree.probe(pt, probedepth=9))
        if len(data) == 0:
            print('No Raw PCA data points for bin %s.... Going to next bin',
                  str(tbin))
            continue
        counts = {}
        for i in biased_hcubes:
            if i not in counts:
                counts[i] = 0
            counts[i] += 1
        for i in local[tkey]['keys']:
            if i not in counts:
                counts[i] = 0
        print('check')
        cvect = [counts[i] for i in local[tkey]['keys']]
        d = np.array(cvect) / sum(cvect)
        c = np.array(data[tkey])
        lcnt = np.sum(c, axis=0)
        gcnt = np.sum(c, axis=1)
        norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
        # Add biased data as a col
        kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
        kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
        arr = np.vstack((norm, kpca_cnt_norm, d)).T
        rowlist = tuple(gcnt) + (
            'localKPCA',
            'biased',
        )
        P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
예제 #2
0
파일: scrapper.py 프로젝트: DaMSL/ddc
def reproj_distro():
  local = get_local()
  data  = get_edges()
  r2 = redis.StrictRedis(port=6385, decode_responses=True)
  for tbin in [(2,4), (2,2), (4,4), (4,2), (4,1), (3,1)]:
    print('Processing:', tbin)
    tkey = '%d_%d' % tbin
    # Get Kernel
    kpca_key = 'subspace:pca:kernel:' + tkey
    kpca = PCAnalyzer.load(r2, kpca_key)
    # Get Training Data
    data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
    pca_pts = np.array([np.fromstring(x) for x in data_raw])
    kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
    proj_pts = kpca.project(alpha.xyz)
    biased_hcubes = []
    for i, pt in enumerate(proj_pts):
      biased_hcubes.append(kdtree.probe(pt, probedepth=9))
    if len(data) == 0:
      print('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
      continue
    counts = {}
    for i in biased_hcubes:
      if i not in counts:
        counts[i] = 0
      counts[i] += 1
    for i in local[tkey]['keys']:
      if i not in counts:
        counts[i] = 0
    print('check')
    cvect = [counts[i] for i in local[tkey]['keys']]
    d = np.array(cvect)/sum(cvect)
    c = np.array(data[tkey])
    lcnt = np.sum(c, axis=0)
    gcnt = np.sum(c, axis=1)
    norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
    # Add biased data as a col
    kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
    kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
    arr = np.vstack((norm, kpca_cnt_norm, d)).T
    rowlist = tuple(gcnt) + ('localKPCA', 'biased',)
    P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
예제 #3
0
    def execute(self):
        """Special execute function for the reweight operator -- check/validate.
      """
        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        self.cacheclient = CacheClient(self.name)
        numLabels = 5
        binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
        labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
        num_pts = len(labeled_pts_rms)
        logging.debug('##NUM_OBS: %d', num_pts)

        # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
        TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4),
                     (0, 2), (0, 1)]
        MAX_SAMPLE_SIZE = 100  # Max # of cov traj to back project per HCube
        MAX_PT_PER_MATRIX = 100  # Num points to sample from each cov traj
        COVAR_SIZE = 200  # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
        MAX_HCUBE = 6  # Max Num HCubes to process

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamme Function

        logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
        logging.info("Retrieving All Covariance Vectors")
        home = os.getenv('HOME')
        cfile = home + '/work/DEBUG_COVAR_PTS'
        DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
        if DO_COVAR:
            if os.path.exists(cfile + '.npy'):
                covar_pts = np.load(cfile + '.npy')
                logging.debug('Loaded From File')
            else:
                covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
                covar_pts = np.array([np.fromstring(x) for x in covar_raw])
                np.save(cfile, covar_pts)
                logging.debug('Loaded From Catalog & Saved')
        covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
        logging.debug('Indiced Loaded. Retrieving File Indices')
        covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

        if DO_COVAR:
            logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
            logging.info(
                "Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)"
            )

            # FOR incrementatl PCA:
            NUM_PC = 6
            ipca_key = 'subspace:covar:ipca'
            ipca = PCAnalyzer.load(self.catalog, ipca_key)
            if ipca is None:
                logging.info('Creating a NEW IPCA')
                ipca = PCAIncremental(NUM_PC)
                lastindex = 0
            else:
                lastindex = ipca.trainsize
                logging.info(
                    'IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                    ipca.trainsize,
                    len(covar_pts) - ipca.trainsize)

            # For incremental, partial solve using only newer pts (from the last "trainsize")
            if len(covar_pts) - lastindex > 0:
                ipca.solve(covar_pts[lastindex:])
                logging.info("Incrementatl PCA Updated. Storing Now...")

                ####  BARRIER
                self.wait_catalog()
                ipca.store(self.catalog, ipca_key)

            logging.info("IPCA Saved. Projecting Covariance to PC")

        cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
        if os.path.exists(cfile + '.npy'):
            subspace_covar_pts = np.load(cfile + '.npy')
        else:
            subspace_covar_pts = ipca.project(covar_pts)
            np.save(cfile, subspace_covar_pts)

        # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
        logging.info(
            'Building Global KD Tree over Covar Subspace with %d data pts',
            len(subspace_covar_pts))
        global_kdtree = KDTree(250,
                               maxdepth=8,
                               data=subspace_covar_pts,
                               method='middle')

        if MAX_HCUBE <= 0:
            hcube_global = global_kdtree.getleaves()
        else:
            # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
            hcube_global_ALL = global_kdtree.getleaves()
            hcube_global = {}
            num = 0
            for k, v in hcube_global_ALL.items():
                hcube_global[k] = v
                num += 1
                if num == MAX_HCUBE:
                    break

        # hcube_global = global_kdtree.getleaves()
        logging.info(
            'Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)'
        )
        for k in sorted(hcube_global.keys()):
            v = hcube_global[k]
            logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'],
                         v['volume'], v['density'])

        if self.filelog:
            keys = hcube_global.keys()
            self.filelog.info('global,keys,%s', ','.join(keys))
            self.filelog.info(
                'global,count,%s',
                ','.join([str(hcube_global[k]['count']) for k in keys]))
            self.filelog.info(
                'global,volume,%s',
                ','.join([str(hcube_global[k]['volume']) for k in keys]))
            self.filelog.info(
                'global,density,%s',
                ','.join([str(hcube_global[k]['density']) for k in keys]))

        logging.info(
            "=====  SELECT Sampling of points from each Global HCube  (B)")
        s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
        hcube_global = {x[0]: x[1] for x in s}

        counter = 0
        for key in hcube_global.keys():
            counter += 1
            if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
                cov_index = hcube_global[key]['elm']
                hcube_global[key]['samplefactor'] = 1
            else:
                cov_index = np.random.choice(hcube_global[key]['elm'],
                                             MAX_SAMPLE_SIZE)
                hcube_global[key]['samplefactor'] = len(
                    hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
            hcube_global[key]['idxlist'] = []
            for cov in cov_index:
                selected_hd_idx = np.random.choice(COVAR_SIZE,
                                                   MAX_PT_PER_MATRIX).tolist()
                hcube_global[key]['idxlist'].extend(
                    [int(covar_index[cov]) + i for i in selected_hd_idx])
            logging.info('Back Projecting Global HCube `%s`  (%d out of %d)',
                         key, counter, len(hcube_global.keys()))
            source_cov = self.backProjection(hcube_global[key]['idxlist'])
            hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
            logging.debug('Back Projected %d points to HD space: %s',
                          len(hcube_global[key]['idxlist']),
                          str(hcube_global[key]['alpha']))

        # logging.info('Calculating all HD Distances')
        # dist_hd = {}
        # dist_ld = {}
        # for key in hcube_global.keys():
        #   T = hcube_global[key]['alpha'].xyz
        #   N = len(T)
        #   dist_hd[key] = np.zeros(shape=(N, N))
        #   dist_ld[key] = {}
        #   for A in range(0, N):
        #     dist_hd[key][A][A] = 0
        #     for B in range(A+1, N):
        #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
        reservoir = ReservoirSample('rms', self.catalog)

        logging.info(
            "=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) "
        )
        hcube_list = {}

        logging.info(
            "Scanning current set of observed bins and finding all smallest with data (excluding largest 2)"
        )
        hcube_local = {}

        logging.info("=======================================================")
        logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
        logging.info(
            "=======================================================\n")

        overlap_hcube = {k: {} for k in hcube_global.keys()}

        projection_map = {}

        pt_projection_list = []
        for key in sorted(hcube_global.keys()):
            for i in range(len(hcube_global[key]['alpha'].xyz)):
                pt_projection_list.append([])
        for bin_idx, tbin in enumerate(TEST_TBIN):
            logging.info("Project Global HCubes into local subspace for %s",
                         str(tbin))
            # Load Vectors
            logging.info('Loading subspace and kernel for bin %s', str(tbin))

            # LOAD KPCA Kernel matrix
            kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
            kpca = PCAnalyzer.load(self.catalog, kpca_key)

            data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
            data = np.array([np.fromstring(x) for x in data_raw])
            if len(data) == 0:
                logging.error(
                    'No Raw PCA data points for bin %s.... Going to next bin',
                    str(tbin))
                continue

            logging.info(
                'Building KDtree over local %s bin from observations matrix of size: %s',
                str(tbin), str(data.shape))
            kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
            hcube_local[tbin] = kdtree.getleaves()
            logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
            for k in sorted(hcube_local[tbin].keys()):
                logging.info('    `%-9s`   #pts:%6d   density:%9.1f', k,
                             len(hcube_local[tbin][k]['elm']),
                             hcube_local[tbin][k]['density'])

            if self.filelog:
                keys = hcube_local[tbin].keys()
                A, B = tbin
                self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
                self.filelog.info(
                    'local,%d_%d,count,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['count']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,volume,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['volume']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,density,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['density']) for k in keys]))

            n_total = 0
            logging.debug('Global Hcubes to Project (%d):  %s',
                          len(hcube_global.keys()), str(hcube_global.keys()))
            projection_map[bin_idx] = {
                k: set()
                for k in hcube_local[tbin].keys()
            }

            pnum = 0
            for key in sorted(hcube_global.keys()):
                overlap_hcube[key][tbin] = {}
                cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

                logging.debug(
                    'PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
                    key, len(cov_proj_pca), str(tbin))
                for i, pt in enumerate(cov_proj_pca):
                    hcube = kdtree.probe(pt, probedepth=9)
                    # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
                    if hcube not in overlap_hcube[key][tbin]:
                        overlap_hcube[key][tbin][hcube] = {
                            'idxlist': hcube_local[tbin][hcube]['elm'],
                            'wgt': hcube_local[tbin][hcube]['density'],
                            'num_projected': 0
                        }
                    overlap_hcube[key][tbin][hcube]['num_projected'] += 1

                    # Index this point in corresponding local HCube projection view
                    projection_map[bin_idx][hcube].add(pnum)

                    pt_projection_list[pnum].append(hcube)
                    pnum += 1

                for k, v in sorted(overlap_hcube[key][tbin].items()):
                    logging.debug(
                        '   Project ==> Local HCube `%-9s`: %5d points', k,
                        v['num_projected'])
                # logging.info('Calculating Lower Dimensional Distances')
                # N = len(cov_proj_pca)
                # dist_ld[key][tbin] = np.zeros(shape=(N, N))
                # for A in range(0, N):
                #   for B in range(A+1, N):
                #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

    # Re-Index projected points -- could make this a list too

        next_index = 0
        view_list = []
        for bin_idx, hcube_map in projection_map.items():
            hcube_list = []
            for hcube_key, pt_list in hcube_map.items():
                hcube_list.append((set((hcube_key, )), set(pt_list)))
            view_list.append((set((bin_idx, )), hcube_list))

        print("CALLING: Collapse Join")
        joined_subspaces = collapse_join(projection_map.keys(), view_list)
        for subspace_list, correlated_hcubes in joined_subspaces:
            tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
            for hcube_list, pt_list in correlated_hcubes:
                print(tbin_list, hcube_list, pt_list)
                # TODO: Corrlate Back to Global
        print('Visualize HERE')

        # for idx, tbin in enumerate(TEST_TBIN):
        #   # Only process substates with data
        #   if tbin not in hcube_local:
        #     logging.warning('Local KD Tree not created for %s', str(tbin))
        #     continue
        #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # sets = {}
        # proj_bin_list = []
        # for tbin in TEST_TBIN:
        #   if tbin not in hcube_local:
        #     continue
        #   proj_bin_list.append(tbin)
        #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # set_list = {}
        # for tbin, view in sets.items():
        #   set_list[(tbin,)] = []
        #   for hcube, idxlist in view.items():
        #     print(tbin, hcube, idxlist)
        #     set_list[(tbin,)].append((set((hcube,)), idxlist))

        # def collapse(C):
        #   a = 0
        #   b = 0
        #   N = []
        #   while a < len(C) and b < len(C):
        #     A = sorted(C[a])
        #     B = sorted(C[b])
        #     if A == B:
        #       b += 1
        #     elif A[0] == B[0]:
        #       N.append(set(A)|set(B))
        #       b += 1
        #     else:
        #       a += 1
        #   if len(N) <= 1:
        #     return []
        #   else:
        #     return N + collapse(N)

        # q=collapse(t1)
        # for i in q: print(sorted(i))

        # print('Checking all 2-Way Joins')
        # join2 = {}
        # for a in range(0, len(proj_bin_list)-1):
        #   tA = proj_bin_list[a]
        #   for b in range(a+1, len(proj_bin_list)):
        #     tB = proj_bin_list[b]
        #     join_ss = tuple(set((tA, tB)))
        #     set_list = []
        #     for kA, vA in sets[tA].items():
        #       for kB, vB in sets[tB].items():
        #         join_hc = set((kA, kB))
        #         inter = vA & vB
        #         if len(inter) > 0:
        #           set_list.append((join_hc, inter))
        #     if len(set_list) > 0:
        #       join2[join_ss] = set_list
        # print('2-Way Join Results:')
        # for ss, set_list in join2.items():
        #   for hc, idxlist in set_list:
        #     print(ss, hc, idxlist)

        # print('Checking all 3-Way Joins')
        # join3 = []
        # checked = []
        # for a in range(0, len(join2)-1):
        #   sA, hA, vA = join2[a]
        #   for b in range(a+1, len(join2)):
        #     sB, hB, vB = join2[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join3.append((ss, hc, inter))

        # print('Checking all 4-Way Joins')
        # join4 = []
        # checked = []
        # for a in range(0, len(join3)-1):
        #   sA, hA, vA = join3[a]
        #   for b in range(a+1, len(join3)):
        #     sB, hB, vB = join3[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join4.append((ss, hc, inter))

        # if self.filelog:
        #   for i in join2:
        #     self.filelog.info('%s', str(i))
        #   for i in join3:
        #     self.filelog.info('%s', str(i))
        #   for i in join4:
        #     self.filelog.info('%s', str(i))

        DO_MIN_CHECK = False
        if DO_MIN_CHECK:

            def maxcount(x):
                y = {}
                for i in x:
                    y[i] = 1 if i not in y else y[i] + 1
                return max(y.values())

            print(
                '%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces'
            )
            argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf)
                                                  for i in x])
            for key in hcube_global.keys():
                # logging.info('Showing MIN / MAX for points from HCube %s:', key)
                minA = {}
                maxA = {}
                for n in range(len(dist_hd[key])):
                    minA[n] = []
                    maxA[n] = []
                    for tbin in TEST_TBIN:
                        if tbin not in dist_ld[key].keys():
                            continue
                            minA[n].append(0)
                            maxA[n].append(0)
                        else:
                            minA[n].append(
                                argmin_nonzero(dist_ld[key][tbin][n]))
                            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
                numsame = np.zeros(len(dist_ld[key].keys()) + 1)
                for n in range(len(dist_hd[key][n])):
                    minH = argmin_nonzero(dist_hd[key][n])
                    maxH = np.argmax(dist_hd[key][n])
                    minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
                    numsamepair = maxcount(minA[n])
                    numsame[numsamepair] += 1
                    # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
                print(' '.join([
                    '%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))
                ]))

        print('Stopping HERE!')
        sys.exit(0)
        #  GAMMA FUNCTION EXPR # 8
        gamma1 = lambda a, b: (a * b)
        gamma2 = lambda a, b: (a + b) / 2

        # TODO: Factor in RMS weight
        for tbin in TEST_TBIN:
            # for tbin in sorted(bin_list):
            logging.info('')
            logging.info('BIPARTITE GRAPH for %s', str(tbin))
            bipart = {}
            edgelist = []
            for hcB in hcube_global.keys():
                num_B = hcube_global[hcB]['count']
                wgt1_B = hcube_global[hcB]['density']
                if tbin not in overlap_hcube[hcB]:
                    continue
                for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
                    edge = {}
                    if hcA not in bipart:
                        bipart[hcA] = []
                    num_proj = hcA_data['num_projected']
                    wgt_A = hcA_data['wgt']
                    wgt2_B = wgt1_B * num_proj
                    edge['combW1'] = gamma1(wgt_A, wgt1_B)
                    edge['combW2'] = gamma1(wgt_A, wgt2_B)
                    edge['combW3'] = gamma2(wgt_A, wgt1_B)
                    edge['combW4'] = gamma2(wgt_A, wgt2_B)
                    edge['num_A'] = len(hcA_data['idxlist'])
                    edge['num_B'] = num_B
                    edge['num_proj'] = num_proj
                    edge['wgt_A'] = wgt_A
                    edge['wgt1_B'] = wgt1_B
                    edge['wgt2_B'] = wgt2_B
                    edge['hcA'] = hcA
                    edge['hcB'] = hcB
                    bipart[hcA].append(edge)
                    edgelist.append((hcA, hcB, num_proj))
            if len(bipart) == 0:
                logging.info("NO DATA FOR %s", str(tbin))
                continue
            logging.info('')
            logging.info(
                'A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)'
            )
            for k, v in bipart.items():
                for edge in v:
                    logging.info(
                        'A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW3)9.1f'
                        % edge)
                    if self.filelog:
                        A, B = tbin
                        self.filelog.info('edge,%d_%d,%s,%s,%d', A, B,
                                          edge['hcA'], edge['hcB'],
                                          edge['num_proj'])

            # Prepare nodes for graph
            nA = set()
            nB = set()
            elist = []
            for e in edgelist:
                a, b, z = e
                if z <= 5:
                    continue
                nA.add(a)
                nB.add(b)
                elist.append((a, b, z))
            nAKeys = sorted(nA)[::-1]
            nBKeys = sorted(nB)[::-1]
            sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
            sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
            idxA = {key: i for i, key in enumerate(nAKeys)}
            idxB = {key: i for i, key in enumerate(nBKeys)}
            edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
            G.bipartite(sizesA, sizesB, edges, sizesA, sizesB,
                        'bipartite_%d_%d' % tbin)

        logging.info('STOPPING HERE!!!!')
        sys.exit(0)
        return []
예제 #4
0
파일: reweight_check.py 프로젝트: DaMSL/ddc
    def execute(self):
      """Special execute function for the reweight operator -- check/validate.
      """
    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      self.cacheclient = CacheClient(self.name)
      numLabels = 5
      binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
      labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
      num_pts = len(labeled_pts_rms)
      logging.debug('##NUM_OBS: %d', num_pts)

      # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
      TEST_TBIN = [(2,0), (4,2), (2,2), (4,1), (3,1), (4,4), (0,4), (0,2), (0,1)]
      MAX_SAMPLE_SIZE   =  100   # Max # of cov traj to back project per HCube
      MAX_PT_PER_MATRIX =  100   # Num points to sample from each cov traj
      COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
      MAX_HCUBE         = 6      # Max Num HCubes to process


    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamme Function

      logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
      logging.info("Retrieving All Covariance Vectors")
      home = os.getenv('HOME')
      cfile = home + '/work/DEBUG_COVAR_PTS'
      DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
      if DO_COVAR: 
        if os.path.exists(cfile + '.npy'):
          covar_pts = np.load(cfile + '.npy')
          logging.debug('Loaded From File')
        else: 
          covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
          covar_pts = np.array([np.fromstring(x) for x in covar_raw])
          np.save(cfile, covar_pts)
          logging.debug('Loaded From Catalog & Saved')
      covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
      logging.debug('Indiced Loaded. Retrieving File Indices')
      covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

      if DO_COVAR: 
        logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
        logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

        # FOR incrementatl PCA:
        NUM_PC = 6
        ipca_key = 'subspace:covar:ipca'
        ipca = PCAnalyzer.load(self.catalog, ipca_key)
        if ipca is None:
          logging.info('Creating a NEW IPCA')
          ipca = PCAIncremental(NUM_PC)
          lastindex = 0
        else:
          lastindex = ipca.trainsize
          logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts', 
            ipca.trainsize, len(covar_pts)-ipca.trainsize)

        # For incremental, partial solve using only newer pts (from the last "trainsize")
        if len(covar_pts)-lastindex > 0:
          ipca.solve(covar_pts[lastindex:])
          logging.info("Incrementatl PCA Updated. Storing Now...")

          ####  BARRIER 
          self.wait_catalog()
          ipca.store(self.catalog, ipca_key)

        logging.info("IPCA Saved. Projecting Covariance to PC")

      cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
      if os.path.exists(cfile + '.npy'):
        subspace_covar_pts = np.load(cfile + '.npy')
      else: 
        subspace_covar_pts = ipca.project(covar_pts)
        np.save(cfile, subspace_covar_pts)

      # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
      logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
      global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')
  

      if MAX_HCUBE <= 0:
        hcube_global = global_kdtree.getleaves()
      else:
      # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
        hcube_global_ALL = global_kdtree.getleaves()
        hcube_global = {}
        num = 0
        for k, v in hcube_global_ALL.items():
          hcube_global[k] = v
          num += 1
          if num == MAX_HCUBE:
            break

      # hcube_global = global_kdtree.getleaves()
      logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
      for k in sorted(hcube_global.keys()):
        v = hcube_global[k]
        logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

      if self.filelog:
        keys = hcube_global.keys()
        self.filelog.info('global,keys,%s',','.join(keys))
        self.filelog.info('global,count,%s',','.join([str(hcube_global[k]['count']) for k in keys]))
        self.filelog.info('global,volume,%s',','.join([str(hcube_global[k]['volume']) for k in keys]))
        self.filelog.info('global,density,%s',','.join([str(hcube_global[k]['density']) for k in keys]))

      logging.info("=====  SELECT Sampling of points from each Global HCube  (B)")
      s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
      hcube_global = {x[0]: x[1] for x in s}


      counter = 0
      for key in hcube_global.keys():
        counter += 1
        if hcube_global[key]['count']  <= MAX_SAMPLE_SIZE:
          cov_index = hcube_global[key]['elm']
          hcube_global[key]['samplefactor'] = 1
        else:
          cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
          hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
        hcube_global[key]['idxlist'] = []
        for cov in cov_index:
          selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
          hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
        logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
        source_cov = self.backProjection(hcube_global[key]['idxlist'])
        hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
        logging.debug('Back Projected %d points to HD space: %s', 
          len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))

      # logging.info('Calculating all HD Distances')
      # dist_hd = {}
      # dist_ld = {}
      # for key in hcube_global.keys():
      #   T = hcube_global[key]['alpha'].xyz
      #   N = len(T)
      #   dist_hd[key] = np.zeros(shape=(N, N))
      #   dist_ld[key] = {}
      #   for A in range(0, N):
      #     dist_hd[key][A][A] = 0
      #     for B in range(A+1, N):
      #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])
        

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
      reservoir = ReservoirSample('rms', self.catalog)

      logging.info("=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) ")
      hcube_list = {}

      logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
      hcube_local = {}

      logging.info("=======================================================")
      logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
      logging.info("=======================================================\n")

      overlap_hcube = {k: {} for k in hcube_global.keys()}

      projection_map = {}


      pt_projection_list = []
      for key in sorted(hcube_global.keys()):
        for i in range(len(hcube_global[key]['alpha'].xyz)):
          pt_projection_list.append([])
      for bin_idx, tbin in enumerate(TEST_TBIN):
        logging.info("Project Global HCubes into local subspace for %s", str(tbin))
        # Load Vectors
        logging.info('Loading subspace and kernel for bin %s', str(tbin))

        # LOAD KPCA Kernel matrix
        kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
        kpca = PCAnalyzer.load(self.catalog, kpca_key)

        data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
        data = np.array([np.fromstring(x) for x in data_raw])
        if len(data) == 0:
          logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
          continue


        logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
        kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
        hcube_local[tbin] = kdtree.getleaves()
        logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
        for k in sorted(hcube_local[tbin].keys()):
          logging.info('    `%-9s`   #pts:%6d   density:%9.1f', 
            k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

        if self.filelog:
          keys = hcube_local[tbin].keys()
          A,B = tbin
          self.filelog.info('local,%d_%d,keys,%s',A,B,','.join(keys))
          self.filelog.info('local,%d_%d,count,%s',A,B,','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
          self.filelog.info('local,%d_%d,volume,%s',A,B,','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
          self.filelog.info('local,%d_%d,density,%s',A,B,','.join([str(hcube_local[tbin][k]['density']) for k in keys]))          

        n_total = 0
        logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
        projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}
        
        pnum = 0
        for key in sorted(hcube_global.keys()):
          overlap_hcube[key][tbin] = {}
          cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

          logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ', 
            key, len(cov_proj_pca), str(tbin))
          for i, pt in enumerate(cov_proj_pca):
            hcube = kdtree.probe(pt, probedepth=9)
            # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
            if hcube not in overlap_hcube[key][tbin]:
              overlap_hcube[key][tbin][hcube] = {
                  'idxlist': hcube_local[tbin][hcube]['elm'],
                  'wgt': hcube_local[tbin][hcube]['density'], 
                  'num_projected': 0}
            overlap_hcube[key][tbin][hcube]['num_projected'] += 1

            # Index this point in corresponding local HCube projection view
            projection_map[bin_idx][hcube].add(pnum)

            pt_projection_list[pnum].append(hcube)
            pnum += 1

          for k, v in sorted(overlap_hcube[key][tbin].items()):
            logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])
          # logging.info('Calculating Lower Dimensional Distances')
          # N = len(cov_proj_pca)
          # dist_ld[key][tbin] = np.zeros(shape=(N, N))
          # for A in range(0, N):
          #   for B in range(A+1, N):
          #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])


    # Re-Index projected points -- could make this a list too

      next_index = 0
      view_list = []
      for bin_idx, hcube_map in projection_map.items():
        hcube_list = []
        for hcube_key, pt_list in hcube_map.items():
          hcube_list.append((set((hcube_key,)), set(pt_list)))
        view_list.append((set((bin_idx,)), hcube_list))

      print("CALLING: Collapse Join")
      joined_subspaces = collapse_join(projection_map.keys(), view_list)
      for subspace_list, correlated_hcubes in joined_subspaces:
        tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
        for hcube_list, pt_list in correlated_hcubes:
          print(tbin_list, hcube_list, pt_list)
          # TODO: Corrlate Back to Global 
      print('Visualize HERE')



      # for idx, tbin in enumerate(TEST_TBIN):
      #   # Only process substates with data
      #   if tbin not in hcube_local:
      #     logging.warning('Local KD Tree not created for %s', str(tbin))
      #     continue
      #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))


      # sets = {}
      # proj_bin_list = []
      # for tbin in TEST_TBIN:
      #   if tbin not in hcube_local:
      #     continue
      #   proj_bin_list.append(tbin)
      #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))

      # set_list = {}
      # for tbin, view in sets.items():
      #   set_list[(tbin,)] = []
      #   for hcube, idxlist in view.items():
      #     print(tbin, hcube, idxlist)
      #     set_list[(tbin,)].append((set((hcube,)), idxlist))


      # def collapse(C):
      #   a = 0
      #   b = 0
      #   N = []
      #   while a < len(C) and b < len(C):
      #     A = sorted(C[a])
      #     B = sorted(C[b])
      #     if A == B:
      #       b += 1
      #     elif A[0] == B[0]:
      #       N.append(set(A)|set(B))
      #       b += 1
      #     else:
      #       a += 1
      #   if len(N) <= 1:
      #     return []
      #   else:
      #     return N + collapse(N)

      # q=collapse(t1)
      # for i in q: print(sorted(i))


      # print('Checking all 2-Way Joins')
      # join2 = {}
      # for a in range(0, len(proj_bin_list)-1):
      #   tA = proj_bin_list[a]
      #   for b in range(a+1, len(proj_bin_list)):
      #     tB = proj_bin_list[b]
      #     join_ss = tuple(set((tA, tB)))
      #     set_list = []
      #     for kA, vA in sets[tA].items():
      #       for kB, vB in sets[tB].items():
      #         join_hc = set((kA, kB))
      #         inter = vA & vB
      #         if len(inter) > 0:
      #           set_list.append((join_hc, inter))
      #     if len(set_list) > 0:
      #       join2[join_ss] = set_list
      # print('2-Way Join Results:')
      # for ss, set_list in join2.items():
      #   for hc, idxlist in set_list:
      #     print(ss, hc, idxlist)


      # print('Checking all 3-Way Joins')
      # join3 = []
      # checked = []
      # for a in range(0, len(join2)-1):
      #   sA, hA, vA = join2[a]
      #   for b in range(a+1, len(join2)):
      #     sB, hB, vB = join2[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join3.append((ss, hc, inter))


      # print('Checking all 4-Way Joins')
      # join4 = []
      # checked = []
      # for a in range(0, len(join3)-1):
      #   sA, hA, vA = join3[a]
      #   for b in range(a+1, len(join3)):
      #     sB, hB, vB = join3[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join4.append((ss, hc, inter))

      # if self.filelog:
      #   for i in join2:
      #     self.filelog.info('%s', str(i))
      #   for i in join3:
      #     self.filelog.info('%s', str(i))
      #   for i in join4:
      #     self.filelog.info('%s', str(i))

      DO_MIN_CHECK = False
      if DO_MIN_CHECK:
        def maxcount(x):
          y={}
          for i in x:
            y[i] = 1 if i not in y else y[i]+1
          return max(y.values())

        print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces')
        argmin_nonzero = lambda x: np.argmin([(i if i>0 else np.inf) for i in x])
        for key in hcube_global.keys():
          # logging.info('Showing MIN / MAX for points from HCube %s:', key)
          minA = {}; maxA={}
          for n in range(len(dist_hd[key])) :
            minA[n]=[] ; maxA[n]=[]
            for tbin in TEST_TBIN:
              if tbin not in dist_ld[key].keys():
                continue
                minA[n].append(0)
                maxA[n].append(0)          
              else:
                minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
                maxA[n].append(np.argmax(dist_ld[key][tbin][n]))          
          numsame = np.zeros(len(dist_ld[key].keys())+1)
          for n in range(len(dist_hd[key][n])):
            minH = argmin_nonzero(dist_hd[key][n])
            maxH = np.argmax(dist_hd[key][n])
            minmax = ['%2d/%-2d'%i for i in zip(minA[n], maxA[n])]
            numsamepair = maxcount(minA[n])
            numsame[numsamepair] += 1
            # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
          print(' '.join(['%4.1f%%'%i for i in (100* (numsame/np.sum(numsame)))]))

      print('Stopping HERE!')
      sys.exit(0)
      #  GAMMA FUNCTION EXPR # 8
      gamma1 = lambda a, b : (a * b)
      gamma2 = lambda a, b : (a + b) / 2

      # TODO: Factor in RMS weight
      for tbin in TEST_TBIN:
      # for tbin in sorted(bin_list):
        logging.info('')
        logging.info('BIPARTITE GRAPH for %s', str(tbin))
        bipart = {}
        edgelist = []
        for hcB in hcube_global.keys():
          num_B  = hcube_global[hcB]['count']
          wgt1_B = hcube_global[hcB]['density']
          if tbin not in overlap_hcube[hcB]:
            continue
          for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
            edge = {}
            if hcA not in bipart:
              bipart[hcA] = []  
            num_proj  = hcA_data['num_projected']
            wgt_A  = hcA_data['wgt']
            wgt2_B = wgt1_B*num_proj
            edge['combW1'] = gamma1(wgt_A, wgt1_B)
            edge['combW2'] = gamma1(wgt_A, wgt2_B)
            edge['combW3'] = gamma2(wgt_A, wgt1_B)
            edge['combW4'] = gamma2(wgt_A, wgt2_B)
            edge['num_A']  = len(hcA_data['idxlist'])
            edge['num_B']  = num_B
            edge['num_proj']  = num_proj
            edge['wgt_A']  = wgt_A
            edge['wgt1_B'] = wgt1_B
            edge['wgt2_B'] = wgt2_B
            edge['hcA'] = hcA
            edge['hcB'] = hcB
            bipart[hcA].append(edge)
            edgelist.append((hcA, hcB, num_proj))
        if len(bipart) == 0:
          logging.info("NO DATA FOR %s", str(tbin))
          continue
        logging.info('')
        logging.info('A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)')
        for k, v in bipart.items():
          for edge in v:
            logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW3)9.1f' % edge)
            if self.filelog:
              A,B = tbin
              self.filelog.info('edge,%d_%d,%s,%s,%d',A,B,edge['hcA'],edge['hcB'],edge['num_proj'])

        # Prepare nodes for graph
        nA = set()
        nB = set()
        elist = []
        for e in edgelist:
          a, b, z = e
          if z <= 5:
            continue
          nA.add(a)
          nB.add(b)
          elist.append((a,b,z))
        nAKeys = sorted(nA)[::-1]
        nBKeys = sorted(nB)[::-1]
        sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
        sizesB = [hcube_global[n]['count']*3 for n in nBKeys]
        idxA = {key: i for i, key in enumerate(nAKeys)}
        idxB = {key: i for i, key in enumerate(nBKeys)}
        edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
        G.bipartite(sizesA,sizesB,edges,sizesA,sizesB,'bipartite_%d_%d' % tbin)

      logging.info('STOPPING HERE!!!!')
      sys.exit(0)
      return []