def batch_query(em, force_recomp=False, test_cxs=None):
    '''Runs each test_cxs as a query. If test_cxs is None, then all queries are run'''
    # TODO: Fix up the VM dependencies
    vm, iom, am, cm = em.hs.get_managers('vm', 'iom', 'am', 'cm')
    # Compute the matches
    qm = vm.hs.qm
    vm.sample_train_set()
    vm.build_model(force_recomp=force_recomp)
    if test_cxs is None:
        test_cxs = vm.get_train_cx()
    logmsg('Building matching graph. This may take awhile')
    depends = ['chiprep', 'preproc', 'model', 'query']
    algo_suffix = am.get_algo_suffix(depends)
    samp_suffix = vm.get_samp_suffix()
    result_dpath = iom.ensure_directory(iom.get_temp_fpath('raw_results'))
    rr_fmtstr_cid = os.path.join(result_dpath,
                                 'rr_cid%07d' + samp_suffix + algo_suffix + '.pkl')
    # Find the queries which need to be run
    unsaved_cxs = []
    for cx in iter(test_cxs):
        cid = cm.cx2_cid[cx]
        rr_fpath = rr_fmtstr_cid % cid
        if not os.path.exists(rr_fpath):
            unsaved_cxs.append(cx)
    # Run each unsaved query
    total = len(unsaved_cxs)
    for count, cx in enumerate(unsaved_cxs):
        logmsg('Query %d/%d' % (count, total))
        em.run_and_save_query(cx, rr_fmtstr_cid)
    # Read each query result
    cx2_rr = alloc_lists(test_cxs.max() + 1)
    total = len(test_cxs)
    for count, cx in enumerate(test_cxs):
        logmsg('Loading Result %d/%d' % (count, total))
        cid = cm.cx2_cid[cx]
        rr_fpath = rr_fmtstr_cid % cid
        if not os.path.exists(rr_fpath):
            logwarn('Result does not exist for CID=%d' % cid)
            continue
        rr_file = open(rr_fpath, 'rb')
        try:
            rr = cPickle.load(rr_file)
        except EOFError:
            rr_file.close()
            os.remove(rr_fpath)
            logwarn('Result was corrupted for CID=%d' % cid)
            continue
        rr_file.close()
        # Release large intermediate fields before keeping the result in memory
        rr.cx2_cscore_ = []
        rr.cx2_fs_ = []
        rr.qfdsc = []
        rr.qfpts = []
        cx2_rr[cx] = rr
    return cx2_rr
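# --------------------------------------------------------------------
# Illustration (not part of the original module): how the per-query
# result cache path built in batch_query resolves for a given chip id.
# The suffix and directory values below are made up for the example;
# the real ones come from am.get_algo_suffix(depends),
# vm.get_samp_suffix(), and iom.ensure_directory(...).
def _demo_rr_cache_path():
    import os
    samp_suffix = '_samp(all)'           # hypothetical sample suffix
    algo_suffix = '_algo(hesaff,sift)'   # hypothetical algorithm suffix
    result_dpath = '/tmp/raw_results'    # hypothetical cache directory
    rr_fmtstr_cid = os.path.join(result_dpath,
                                 'rr_cid%07d' + samp_suffix + algo_suffix + '.pkl')
    # Filling in chip id 42 gives that query's cache file:
    # /tmp/raw_results/rr_cid0000042_samp(all)_algo(hesaff,sift).pkl
    return rr_fmtstr_cid % 42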
def query_db_vs_db(hsA, hsB):
    'Runs cross database queries / reloads cross database queries'
    vs_str = get_results_name(hsA, hsB)
    print('Running/Loading ' + vs_str)
    query_cxs = hsA.cm.get_valid_cxs()
    total = len(query_cxs)
    cx2_rr = alloc_lists(total)
    for count, qcx in enumerate(query_cxs):
        #with Timer() as t:
        #print(('Query %d / %d ' % (count, total)) + vs_str)
        rr = hsB.qm.cx2_rr(qcx, hsA)
        cx2_rr[count] = rr
    return cx2_rr
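# --------------------------------------------------------------------
# Illustration (not part of the original module): the intended call
# pattern for cross-database experiments. hsA and hsB are database
# handles opened elsewhere; each direction must be run separately.
#   cx2_rr_AvsB = query_db_vs_db(hsA, hsB)  # query hsA chips against hsB
#   cx2_rr_BvsA = query_db_vs_db(hsB, hsA)  # query hsB chips against hsA
# Note that the returned list is indexed by enumeration order of the
# valid query chips, not by chip index as in batch_query.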
def flann_one_time(data_vecs, query_vecs, num_neighbors, flann_args):
    'Builds a temporary FLANN index over data_vecs and returns the nearest neighbors of query_vecs'
    flann = FLANN()
    flann.build_index(data_vecs, **flann_args)
    (index_list, dist_list) = flann.nn_index(query_vecs, num_neighbors,
                                             checks=flann_args['checks'])
    return (index_list, dist_list)


def approximate_kmeans(data, K=1e6, max_iters=1000, flann_pref=None):
    if flann_pref is None:
        flann_pref = Pref()
        flann_pref.algorithm = Pref('kdtree')
        flann_pref.trees = Pref(8)
        flann_pref.checks = Pref(128)
    flann_args = flann_pref.to_dict()
    float_data = np.array(data, dtype=np.float32)
    N = float_data.shape[0]
    K = int(K)  # ensure an integer cluster count (default is given as 1e6)
    print('Approximately clustering %d data vectors into %d clusters' % (N, K))
    np.random.seed(seed=0)  # For reproducibility
    # Initialize to random cluster centers
    centx = np.random.choice(N, size=K, replace=False)
    cent = np.copy(float_data[centx])
    assign = alloc_lists(K)  # List for each cluster center with assigned indexes
    for iterx in xrange(0, max_iters):
        print('Iteration ' + str(iterx))
        # Step 1: Find the nearest cluster center for every data vector
        datax2_centx, _ = flann_one_time(cent, float_data, 1, flann_args)
        # Step 2: Assign data to cluster centers
        datax_sort = datax2_centx.argsort()
        centx_sort = datax2_centx[datax_sort]
        # Efficiently trace over sorted centers with two pointers. Take care
        # to include the last batch of datavecs with the same center index.
        converged = True
        prev_centx = -1
        _L = 0
        dbg_total_assigned = 0
        dbg_assigned_list = []
        for _R in xrange(N + 1):
            # Loop over datapoints, going 1 past the end, and group them
            # data = 0[ . . . . . . . . . . . . .]N
            # ptrs =   L     R
            #          |- k -|L      R
            #                |- k+1 -|L        R
            #                                  |_K|
            if _R == N or centx_sort[_L] != centx_sort[_R]:
                # We found a group
                centx = centx_sort[_L]  # Assign this group cluster index: centx
                # SPECIAL CASE: (akmeans might not assign everything)
                if centx - prev_centx > 1:  # Check if a cluster got skipped
                    for skipx in xrange(prev_centx + 1, centx):
                        print('  Skipping Index: ' + str(skipx))
                        if len(assign[skipx]) != 0:
                            converged = False
                        assign[skipx] = []
                prev_centx = centx
                # Set assignments
                num_members = np.float32(_R - _L)
                dbg_total_assigned += num_members
                centx_membx = datax_sort[_L:_R]
                # DBG CODE, keep track of data vectors you've assigned
                #print('  Assigning %d data vectors to center index: %d' % (num_members, centx))
                #for x in centx_membx:
                #    dbg_assigned_list.append(x)
                # /DBG CODE
                if np.all(assign[centx] != centx_membx):
                    converged = False
                assign[centx] = centx_membx
                # Recompute centers
                cent[centx] = float_data[centx_membx, :].sum(axis=0) / num_members
                _L = _R
        #print('  Did Assignment of %d centers' % prev_centx)
        #print('  Assigned %d datavectors in total' % dbg_total_assigned)
        # SPECIAL CASE: has to run at the end again
        if prev_centx < K:  # Check if a cluster got skipped at the end
            for skipx in xrange(prev_centx + 1, K):
                print('  Cluster Index %d was empty' % skipx)
                if len(assign[skipx]) != 0:
                    converged = False
                assign[skipx] = []
            prev_centx = centx
        if converged:  # Assignments have not changed
            print('akmeans converged in ' + str(iterx) + ' iterations')
            break
    return cent, assign
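# --------------------------------------------------------------------
# Illustration (not part of the original module): the two-pointer
# grouping used inside approximate_kmeans, shown on made-up cluster
# assignments. Sorting the assignments lets each cluster's members be
# read off as one contiguous slice [_L:_R] of the sorted index array.
def _demo_two_pointer_grouping():
    import numpy as np
    # Hypothetical assignments: 10 data vectors, 4 cluster centers
    datax2_centx = np.array([2, 0, 1, 2, 0, 3, 1, 2, 0, 3])
    N = len(datax2_centx)
    datax_sort = datax2_centx.argsort(kind='mergesort')  # stable sort
    centx_sort = datax2_centx[datax_sort]
    groups = {}
    _L = 0
    for _R in range(1, N + 1):
        # A group ends when the center index changes or we pass the end
        if _R == N or centx_sort[_L] != centx_sort[_R]:
            groups[int(centx_sort[_L])] = datax_sort[_L:_R].tolist()
            _L = _R
    # groups == {0: [1, 4, 8], 1: [2, 6], 2: [0, 3, 7], 3: [5, 9]}
    return groups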
def assign_feature_matches_1vM(rr, hs, K, method, cids_to_remove):
    '''Assigns each query feature to its K nearest database features
    with a similarity-score. Each feature votes for its assigned chip
    with this weight.'''
    logdbg('Assigning feature matches and initial scores')
    # Get managers
    cm = hs.cm
    nm = hs.nm
    vm = hs.vm
    # Get intermediate results
    qcx = rr.qcx
    qcid = rr.qcid
    qfdsc = rr.qfdsc
    qfpts = rr.qfpts
    num_qf = qfpts.shape[0]
    # define: Prefix K = list of K+1 nearest; k = K nearest
    # Everything is done in a flat manner, and reshaped at the end.
    if len(cids_to_remove) > 0:
        K += len(cids_to_remove)
        logdbg('K = %d. Increased by %d to account for removing results'
               % (K, len(cids_to_remove)))
    # qfx = Query Feature Index
    # Kwxs = the Kth result word index ; Kdists = the Kth result distance
    (qfx2_Kwxs, qfx2_Kdists) = vm.nearest_neighbors(qfdsc, K + 1)
    # ---
    # Candidate score the nearest neighbor matches
    # p - pth nearest ; o - k+1th nearest
    score_fn_dict = {
        'DIFF':  lambda p, o: o - p,
        'RAT':   lambda p, o: o / p,
        'LNRAT': lambda p, o: np.log2(o / p),
        'COUNT': lambda p, o: 1,
        'NDIST': lambda p, o: 10e16 - p,
        'TFIDF': lambda wx2_tf, wx_idf, wx: wx2_tf[wx] * wx_idf[wx]}
    score_fn = score_fn_dict[method]
    if method == 'TFIDF':
        # The wx2_qtf could really be per k or aggregated across all K
        # NOTE: this branch assumes per-feature word assignments (qfx2_wxs);
        # the voting below uses qfx2_kweight from the else branch.
        w_histo = bincount(qfx2_wxs, minlength=vm.numWords())
        wx2_qtf = np.array(w_histo, dtype=np.float32) / num_qf
        qfx2_vweight = score_fn(wx2_qtf, vm.wx2_idf, qfx2_wxs)
    else:
        # Distances to the 0-K results
        p_vote = qfx2_Kdists[:, 0:K] + 1
        # Distance to the K+1th result
        o_norm = np.tile(qfx2_Kdists[:, -1].reshape(num_qf, 1) + 1, (1, K))
        # Use score method to get weight
        qfx2_kweight = np.array([score_fn(p, o) for (p, o) in
                                 iter(zip(p_vote.flat, o_norm.flat))],
                                dtype=np.float32)
        qfx2_kweight.shape = (num_qf, K)
    # ---
    # Use the scores to cast weighted votes for database chips
    #
    if len(cids_to_remove) > 0:
        # Remove the query from results
        # query feature index 2 agg descriptor indexes -> cids -> self_query_bit -> clean_axs
        #
        # Feature Matches -> Chip Ids
        logdbg('Query qcid=%r are being removed from results ' % cids_to_remove)
        qfx2_Kaxs_ = vm.wx2_axs[qfx2_Kwxs]
        qfx2_Kcids_ = [vm.ax2_cid[axs] for axs in qfx2_Kaxs_.flat]
        # Test if each FeatureMatch-ChipId is the Query-ChipId.
        qfx2_Ksqbit_ = [~np.in1d(cids, cids_to_remove) for cids in qfx2_Kcids_]
        # Remove FeatureMatches to the Query-ChipId
        qfx2_Kaxs = [np.array(axs)[sqbit].tolist() for (axs, sqbit) in
                     iter(zip(qfx2_Kaxs_.flat, qfx2_Ksqbit_))]
    else:
        qfx2_Kaxs_ = vm.wx2_axs[qfx2_Kwxs]
        qfx2_Kaxs = [np.array(axs).tolist() for axs in qfx2_Kaxs_.flat]
    # Clean Vote for Info
    qfx2_Kcxs = np.array([vm.ax2_cx(axs) for axs in qfx2_Kaxs])
    qfx2_Kfxs = np.array([vm.ax2_fx[axs] for axs in qfx2_Kaxs])
    qfx2_Knxs = np.array([cm.cx2_nx[cxs] for cxs in qfx2_Kcxs])
    if qfx2_Kfxs.size == 0:
        logerr('Cannot query when there is only one chip in the database')
    # Reshape Vote for Info
    qfx2_Kcxs = np.array(qfx2_Kcxs).reshape(num_qf, K + 1)
    qfx2_Kfxs = np.array(qfx2_Kfxs).reshape(num_qf, K + 1)
    qfx2_Knxs = np.array(qfx2_Knxs).reshape(num_qf, K + 1)
    # Using the K=K+1 results, make k=K scores
    qfx2_kcxs_vote = qfx2_Kcxs[:, 0:K]  # vote for cx
    qfx2_kfxs_vote = qfx2_Kfxs[:, 0:K]  # vote for fx
    qfx2_knxs_vote = qfx2_Knxs[:, 0:K]  # check with nx
    # Attempt to recover from problems where K is too small
    qfx2_knxs_norm = np.tile(qfx2_Knxs[:, K].reshape(num_qf, 1), (1, K))
    # Remove unidentified chips from this test
    qfx2_knxs_norm[qfx2_knxs_norm == nm.UNIDEN_NX()] = 0
    qfx2_kcxs_norm = np.tile(qfx2_Kcxs[:, K].reshape(num_qf, 1), (1, K))
    # If the normalizer has the same name but is a different chip, there is a good
    # chance it is a correct match and was penalized by the scoring function
    qfx2_normgood_bit = np.logical_and(qfx2_kcxs_vote != qfx2_kcxs_norm,
                                       qfx2_knxs_vote == qfx2_knxs_norm)
    #qfx2_kweight[qfx2_normgood_bit] = 2
    # -----
    # Build FeatureMatches and FeatureScores
    #
    cx2_fm = alloc_lists(cm.max_cx + 1)
    cx2_fs_ = alloc_lists(cm.max_cx + 1)
    qfx2_qfx = np.tile(np.arange(0, num_qf).reshape(num_qf, 1), (1, K))
    # Add matches and scores
    for (qfx, qfs, cxs, fxs) in iter(zip(qfx2_qfx.flat,
                                         qfx2_kweight.flat,
                                         qfx2_kcxs_vote.flat,
                                         qfx2_kfxs_vote.flat)):
        if cxs.size == 0:
            continue
        for (vote_cx, vote_fx) in iter(zip(np.nditer(cxs), np.nditer(fxs))):
            cx2_fm[vote_cx].append((qfx, vote_fx))
            cx2_fs_[vote_cx].append(qfs)
    # Convert correspondences to numpy
    for cx in xrange(len(cx2_fs_)):
        num_m = len(cx2_fm[cx])
        cx2_fs_[cx] = np.array(cx2_fs_[cx], dtype=np.float32)
        cx2_fm[cx] = np.array(cx2_fm[cx], dtype=np.uint32).reshape(num_m, 2)
    logdbg('Setting feature assignments')
    rr.cx2_fm = cx2_fm
    rr.cx2_fs_ = cx2_fs_
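# --------------------------------------------------------------------
# Illustration (not part of the original module): how the DIFF, RAT and
# LNRAT scoring methods in assign_feature_matches_1vM weight the K vote
# neighbors against the (K+1)-th "normalizer" neighbor, using made-up
# distances for a single query feature.
def _demo_nn_scoring():
    import numpy as np
    # Hypothetical K+1 = 4 nearest-neighbor distances (K = 3 votes)
    Kdists = np.array([10.0, 40.0, 90.0, 100.0])
    p = Kdists[0:3] + 1   # distances to the voting neighbors
    o = Kdists[-1] + 1    # distance to the normalizing neighbor
    diff_score = o - p             # 'DIFF'  -> [90., 60., 10.]
    rat_score = o / p              # 'RAT'   -> [~9.18, ~2.46, ~1.11]
    lnrat_score = np.log2(o / p)   # 'LNRAT' -> [~3.20, ~1.30, ~0.15]
    # Neighbors that are much closer than the normalizer get larger weights.
    return diff_score, rat_score, lnrat_score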