def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight,
                        alpha=3, thresh=0):
    """
    Internals step4

    Computes gamma normalization scalar for the database annotations

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, alpha, thresh)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress(
            '[smk_index] Gamma Group: ', len(wx_sublist), flushfreq=100, writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_
    if utool.VERBOSE:
        end1_()
    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress(
            '[smk_index] Gamma Sum: ', len(daid2_wx2_drvecs), flushfreq=100, writefreq=25)
    aid_list = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list = [list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs] for aidwxs in aidwxs_list]
    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
                  for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)]
    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
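# ----------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): compute_data_gamma_ leaves
# the actual per-annotation normalization to smk_core.gamma_summation2. Under
# the SMK formulation that helper is assumed to implement, the scalar is
#   gamma(X) = 1 / sqrt(sum_w idf_w * sum_{i,j} sel(r_i . r_j))
# where sel(u) = sign(u) * |u| ** alpha for u > thresh and 0 otherwise.
# The functions below are a hypothetical standalone re-implementation added
# only for clarity; the real smk_core version may differ in details.
import numpy as np  # repeated here so the sketch stands alone


def _selectivity_sketch(u, alpha=3, thresh=0):
    """ SMK selectivity applied to residual-vector dot products """
    out = np.sign(u) * np.power(np.abs(u), alpha)
    out[u <= thresh] = 0
    return out


def _gamma_sketch(rvecs_list, weight_list, alpha=3, thresh=0):
    """ Toy gamma: rvecs_list[i] is an (n_i, dim) array of residuals for word i """
    total = 0.0
    for weight, rvecs in zip(weight_list, rvecs_list):
        simmat = rvecs.dot(rvecs.T)  # pairwise residual similarities within one word
        total += weight * _selectivity_sketch(simmat, alpha, thresh).sum()
    return 1.0 / np.sqrt(total) if total > 0 else 1.0

# Example usage with made-up residuals for two visual words:
#   rvecs_list = [np.random.rand(3, 8) - .5, np.random.rand(2, 8) - .5]
#   weight_list = [1.2, 0.7]
#   print(_gamma_sketch(rvecs_list, weight_list))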
def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, with_pandas=WITH_PANDAS):
    """
    Assigns descriptor-vectors to nearest word. Returns forward and inverted index.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> idx_name, series_name = 'idx', 'wx2_idxs'
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, idx2_wx = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Find each vector's nearest word
    # TODO: multiple assignment
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        #((words[_idx2_wx[:, 0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0)
        #_idx2_wdist[:, 0]
        #np.sqrt(((words[_idx2_wx[:, 0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0))
        # Multi-assignment filtering as in
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        alpha = 1.2
        thresh = alpha * _idx2_wdist.T[0:1].T
        invalid = _idx2_wdist >= thresh
        # Weighting as in Lost in Quantization
        sigma = 80
        unnorm_weight = np.exp(np.divide(-_idx2_wdist.astype(np.float64), 2 * (sigma ** 2)))
        masked_weight = np.ma.masked_array(unnorm_weight, mask=invalid)
        weight = masked_weight / masked_weight.sum(axis=1)[:, np.newaxis]
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        idx2_wxs = map(utool.filter_Nones, masked_wxs.tolist())
        idx2_wx_weights = map(utool.filter_Nones, weight.tolist())
        #masked_weight1 = np.ma.masked_array(_idx2_wdist, mask=invalid)
        #weight1 = masked_weight1 / masked_weight1.sum(axis=1)[:, np.newaxis]
        # multiple assignment weight: exp(-(d ** 2) / (2 * sigma ** 2))
        # The distance d_0 to the nearest word is used to discard assignments
        # with distance greater than alpha * d_0, where alpha = 1.2
    PANDAS_GROUP = True or with_pandas
    # Compute inverted index
    if PANDAS_GROUP:
        # Pandas grouping seems to be faster in this instance
        word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])  # 141 us
        word_group = word_assignments.groupby('wx')  # 34.5 us
        _wx2_idxs = word_group['wx'].indices  # 8.6 us
    else:
        idx2_idx = np.arange(len(idx2_vec))
        wx_list, groupxs = smk_speed.group_indicies(_idx2_wx)  # 5.52 ms
        idxs_list = smk_speed.apply_grouping(idx2_idx, groupxs)  # 2.9 ms
        _wx2_idxs = dict(zip(wx_list, idxs_list))  # 753 us
    #
    if with_pandas:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            _wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'),
            dense=dense)  # 274 ms 97.4 %
        idx2_wx = pdh.IntSeries(_idx2_wx, index=idx_series, name='wx')
    else:
        if dense:
            wx2_idxs = {
                wx: _wx2_idxs[wx].astype(INDEX_TYPE) if wx in _wx2_idxs else
                np.empty(0, dtype=INDEX_TYPE)
                for wx in range(len(words))
            }
            #wx2_idxs = _wx2_idxs
            #for wx in range(len(words)):
            #    if wx not in wx2_idxs:
            #        wx2_idxs[wx] = np.empty(0, dtype=INDEX_TYPE)
        else:
            wx2_idxs = _wx2_idxs
        idx2_wx = _idx2_wx
    return wx2_idxs, idx2_wx
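# ----------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): the nAssign > 1 branch of
# assign_to_words_ keeps only the assignments whose (squared) distance is below
# alpha * d_0, where d_0 is the distance to the closest word and alpha = 1.2,
# then weights the survivors with exp(-d / (2 * sigma ** 2)) normalized per
# descriptor. The helper below is a simplified plain-numpy version using
# made-up distances; the real code operates on masked arrays over FLANN output.
import numpy as np  # repeated here so the sketch stands alone


def _multi_assign_weights_sketch(wdist, alpha=1.2, sigma=80):
    """ wdist: (num_vecs, nAssign) squared distances, sorted ascending per row """
    thresh = alpha * wdist[:, 0:1]  # per-descriptor threshold alpha * d_0 (assumes d_0 > 0)
    invalid = wdist >= thresh       # drop assignments that are too far from the descriptor
    unnorm = np.exp(-wdist.astype(np.float64) / (2 * sigma ** 2))
    unnorm[invalid] = 0.0           # zero out instead of masking (simplification)
    weights = unnorm / unnorm.sum(axis=1, keepdims=True)
    return weights, ~invalid

# Example usage with fabricated distances for two descriptors and nAssign=3:
#   wdist = np.array([[100., 110., 300.],
#                     [ 50., 200., 210.]])
#   weights, valid = _multi_assign_weights_sketch(wdist)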