def __quantize_desc_to_tfidf_vvec(desc, wx2_idf, words, words_flann):
    '''Quantizes a set of descriptors into a single tf-idf visual vector.

    Every descriptor in desc is matched to one visual word through
    words_flann.nn_index, the matches are counted per word, the counts are
    weighted by wx2_idf and the result is passed through
    algos.sparse_normalize_rows.

    Returns the tuple (vvec, fx2_wx):
        vvec   - sparse (1 x len(words)) row built from the word counts
        fx2_wx - the word index that nn_index assigned to each descriptor
    '''
    # Assign each descriptor to its nearest visual word
    #TODO: soft assignment here
    fx2_wx, _ = words_flann.nn_index(desc, 1, checks=128)
    # Accumulate term frequencies in a column, one slot per visual word
    num_words = len(words)
    tf_column = spsparse.lil_matrix((num_words, 1))
    for wx in fx2_wx:
        tf_column[wx, 0] += 1
    # Flip the column into a single csr row
    tf_row = spsparse.csr_matrix(tf_column.T, copy=False)
    # Apply the idf weights, then normalize
    weighted_row = algos.sparse_multiply_rows(tf_row, wx2_idf)
    return algos.sparse_normalize_rows(weighted_row), fx2_wx
def __quantize_desc_to_tfidf_vvec(desc, wx2_idf, words, words_flann):
    '''Builds the normalized tf-idf visual vector for one descriptor set.

    Steps: (1) look up one visual word per descriptor with
    words_flann.nn_index, (2) count how often each word was hit,
    (3) scale the counts with wx2_idf via algos.sparse_multiply_rows,
    (4) normalize with algos.sparse_normalize_rows.

    Returns (vvec, fx2_wx) where vvec is the resulting sparse row of width
    len(words) and fx2_wx holds the word index chosen for each descriptor.
    '''
    # (1) nearest visual word per descriptor
    #TODO: soft assignment here
    nn_result = words_flann.nn_index(desc, 1, checks=128)
    fx2_wx, _ = nn_result
    # (2) term frequency counts, stored sparsely as a (num_words x 1) column
    word_counts = spsparse.lil_matrix((len(words), 1))
    for word_index in fx2_wx:
        word_counts[word_index, 0] += 1
    counts_as_row = spsparse.csr_matrix(word_counts.T, copy=False)
    # (3) tf-idf weighting
    tfidf_row = algos.sparse_multiply_rows(counts_as_row, wx2_idf)
    # (4) normalization
    vvec = algos.sparse_normalize_rows(tfidf_row)
    return vvec, fx2_wx
def _ragged_object_array(list_of_lists):
    '''Packs a list of (possibly unequal-length) lists into a 1-D object array.'''
    packed = np.empty(len(list_of_lists), dtype=object)
    # Assign element by element so every inner list is stored as-is.
    # np.array(list_of_lists) would instead build a 2-D array when all inner
    # lists share a length, and recent NumPy releases refuse ragged input
    # unless dtype=object is requested explicitly.
    for ix, inner in enumerate(list_of_lists):
        packed[ix] = inner
    return packed


def __index_database_to_vocabulary(cx2_desc, words, words_flann, indexed_cxs, cache_dir):
    '''Assigns each database chip a visual-vector and returns data for the
    inverted file.

    The four results are cached in cache_dir under a uid made of a hash of
    the aggregated descriptors plus the matcher uid. When all four cache
    files load, they are returned without recomputation; an IOError during
    loading triggers a full rebuild followed by a save.

    Args:
        cx2_desc    - per-chip descriptors, forwarded to __aggregate_descriptors
        words       - visual vocabulary; only len(words) is used here
        words_flann - index exposing nn_index(desc, num_neighbors, checks=...)
        indexed_cxs - chip indexes that make up the database
        cache_dir   - directory handed to io.smart_load / io.smart_save

    Returns the tuple (cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf):
        cx2_vvec - sparse matrix of tf-idf weighted, normalized rows with
                   len(words) columns
        wx2_cxs  - 1-D object array; entry wx lists the ax2_cx values of the
                   descriptors assigned to word wx
        wx2_fxs  - 1-D object array; entry wx lists the ax2_fx values of the
                   descriptors assigned to word wx
        wx2_idf  - array of log2(num_indexed / (document frequency + 1))
    '''
    # TODO: Save precomputations here
    print('[mc2] Assigning each database chip a bag-of-words vector')
    num_indexed = len(indexed_cxs)
    ax2_cx, ax2_fx, ax2_desc = __aggregate_descriptors(cx2_desc, indexed_cxs)
    # Build UID
    matcher_uid = params.get_matcher_uid()
    data_uid = helpers.hashstr(ax2_desc)
    uid = data_uid + '_' + matcher_uid
    try:
        cx2_vvec = io.smart_load(cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
        wx2_cxs = io.smart_load(cache_dir, 'wx2_cxs', uid, '.npy')
        wx2_fxs = io.smart_load(cache_dir, 'wx2_fxs', uid, '.npy')
        wx2_idf = io.smart_load(cache_dir, 'wx2_idf', uid, '.npy')
        print('[mc2] successful cache load: vocabulary indexed databased.')
        return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
    except IOError as ex:
        # Cache miss (or unreadable cache): report it and rebuild below
        print(repr(ex))
    print('[mc2] quantizing each descriptor to a word')
    # Assign each descriptor to its nearest visual word
    print('[mc2] ...this may take awhile with no indication of progress')
    tt1 = helpers.Timer('quantizing each descriptor to a word')
    ax2_wx, _ = words_flann.nn_index(ax2_desc, 1, checks=128)
    tt1.toc()
    # Build inverse word to ax
    tt2 = helpers.Timer('database_indexing')
    print('')
    print('[mc2] building inverse word to ax map')
    num_words = len(words)
    wx2_axs = [[] for _ in range(num_words)]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Compute inverted file: words -> database
    print('[mc2] building inverted file word -> database')
    wx2_cxs = _ragged_object_array(
        [[ax2_cx[ax] for ax in ax_list] for ax_list in wx2_axs])
    wx2_fxs = _ragged_object_array(
        [[ax2_fx[ax] for ax in ax_list] for ax_list in wx2_axs])
    # Build sparse visual vectors with term frequency weights
    print('[mc2] building sparse visual words')
    coo_values = np.ones(len(ax2_cx), dtype=BOW_DTYPE)
    coo_format = (coo_values, (ax2_cx, ax2_wx))
    # Rows keep the extent scipy would infer (largest chip index + 1), but the
    # column count is pinned to the vocabulary size so the matrix always lines
    # up with wx2_idf even when the highest-numbered words received no
    # descriptors.
    coo_shape = (int(np.max(ax2_cx)) + 1, num_words)
    coo_cx2_vvec = spsparse.coo_matrix(coo_format, shape=coo_shape,
                                       dtype=np.float64, copy=True)
    # Duplicate (row, col) entries are summed on conversion, giving the counts
    cx2_tf_vvec = spsparse.csr_matrix(coo_cx2_vvec, copy=False)
    # Compute idf_w = log(Number of documents / Number of docs containing word_j)
    print('[mc2] computing tf-idf')
    # The +1 keeps the divisor non-zero for words no chip uses
    wx2_df = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float64)
    wx2_idf = np.log2(float(num_indexed) / wx2_df)
    # Compute tf-idf
    print('[mc2] preweighting with tf-idf')
    cx2_tfidf_vvec = algos.sparse_multiply_rows(cx2_tf_vvec, wx2_idf)
    # Normalize
    print('[mc2] normalizing')
    cx2_vvec = algos.sparse_normalize_rows(cx2_tfidf_vvec)
    tt2.toc()
    # Save to cache
    print('[mc2] saving to cache')
    io.smart_save(cx2_vvec, cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
    io.smart_save(wx2_cxs, cache_dir, 'wx2_cxs', uid, '.npy')
    io.smart_save(wx2_fxs, cache_dir, 'wx2_fxs', uid, '.npy')
    io.smart_save(wx2_idf, cache_dir, 'wx2_idf', uid, '.npy')
    return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
def _ragged_object_array(list_of_lists):
    '''Packs a list of (possibly unequal-length) lists into a 1-D object array.'''
    packed = np.empty(len(list_of_lists), dtype=object)
    # Assign element by element so every inner list is stored as-is.
    # np.array(list_of_lists) would instead build a 2-D array when all inner
    # lists share a length, and recent NumPy releases refuse ragged input
    # unless dtype=object is requested explicitly.
    for ix, inner in enumerate(list_of_lists):
        packed[ix] = inner
    return packed


def __index_database_to_vocabulary(cx2_desc, words, words_flann, indexed_cxs, cache_dir):
    '''Assigns each database chip a visual-vector and returns data for the
    inverted file.

    The four results are cached in cache_dir under a uid made of a hash of
    the aggregated descriptors plus the matcher uid. When all four cache
    files load, they are returned without recomputation; an IOError during
    loading triggers a full rebuild followed by a save.

    Args:
        cx2_desc    - per-chip descriptors, forwarded to __aggregate_descriptors
        words       - visual vocabulary; only len(words) is used here
        words_flann - index exposing nn_index(desc, num_neighbors, checks=...)
        indexed_cxs - chip indexes that make up the database
        cache_dir   - directory handed to io.smart_load / io.smart_save

    Returns the tuple (cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf):
        cx2_vvec - sparse matrix of tf-idf weighted, normalized rows with
                   len(words) columns
        wx2_cxs  - 1-D object array; entry wx lists the ax2_cx values of the
                   descriptors assigned to word wx
        wx2_fxs  - 1-D object array; entry wx lists the ax2_fx values of the
                   descriptors assigned to word wx
        wx2_idf  - array of log2(num_indexed / (document frequency + 1))
    '''
    # TODO: Save precomputations here
    print('[mc2] Assigning each database chip a bag-of-words vector')
    num_indexed = len(indexed_cxs)
    ax2_cx, ax2_fx, ax2_desc = __aggregate_descriptors(cx2_desc, indexed_cxs)
    # Build UID
    matcher_uid = params.get_matcher_uid()
    data_uid = helpers.hashstr(ax2_desc)
    uid = data_uid + '_' + matcher_uid
    try:
        cx2_vvec = io.smart_load(cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
        wx2_cxs = io.smart_load(cache_dir, 'wx2_cxs', uid, '.npy')
        wx2_fxs = io.smart_load(cache_dir, 'wx2_fxs', uid, '.npy')
        wx2_idf = io.smart_load(cache_dir, 'wx2_idf', uid, '.npy')
        print('[mc2] successful cache load: vocabulary indexed databased.')
        return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf
    except IOError as ex:
        # Cache miss (or unreadable cache): report it and rebuild below
        print(repr(ex))
    print('[mc2] quantizing each descriptor to a word')
    # Assign each descriptor to its nearest visual word
    print('[mc2] ...this may take awhile with no indication of progress')
    tt1 = helpers.Timer('quantizing each descriptor to a word')
    ax2_wx, _ = words_flann.nn_index(ax2_desc, 1, checks=128)
    tt1.toc()
    # Build inverse word to ax
    tt2 = helpers.Timer('database_indexing')
    print('')
    print('[mc2] building inverse word to ax map')
    num_words = len(words)
    wx2_axs = [[] for _ in range(num_words)]
    for ax, wx in enumerate(ax2_wx):
        wx2_axs[wx].append(ax)
    # Compute inverted file: words -> database
    print('[mc2] building inverted file word -> database')
    wx2_cxs = _ragged_object_array(
        [[ax2_cx[ax] for ax in ax_list] for ax_list in wx2_axs])
    wx2_fxs = _ragged_object_array(
        [[ax2_fx[ax] for ax in ax_list] for ax_list in wx2_axs])
    # Build sparse visual vectors with term frequency weights
    print('[mc2] building sparse visual words')
    coo_values = np.ones(len(ax2_cx), dtype=BOW_DTYPE)
    coo_format = (coo_values, (ax2_cx, ax2_wx))
    # Rows keep the extent scipy would infer (largest chip index + 1), but the
    # column count is pinned to the vocabulary size so the matrix always lines
    # up with wx2_idf even when the highest-numbered words received no
    # descriptors.
    coo_shape = (int(np.max(ax2_cx)) + 1, num_words)
    coo_cx2_vvec = spsparse.coo_matrix(coo_format, shape=coo_shape,
                                       dtype=np.float64, copy=True)
    # Duplicate (row, col) entries are summed on conversion, giving the counts
    cx2_tf_vvec = spsparse.csr_matrix(coo_cx2_vvec, copy=False)
    # Compute idf_w = log(Number of documents / Number of docs containing word_j)
    print('[mc2] computing tf-idf')
    # The +1 keeps the divisor non-zero for words no chip uses
    wx2_df = np.array([len(set(cxs)) + 1 for cxs in wx2_cxs], dtype=np.float64)
    wx2_idf = np.log2(float(num_indexed) / wx2_df)
    # Compute tf-idf
    print('[mc2] preweighting with tf-idf')
    cx2_tfidf_vvec = algos.sparse_multiply_rows(cx2_tf_vvec, wx2_idf)
    # Normalize
    print('[mc2] normalizing')
    cx2_vvec = algos.sparse_normalize_rows(cx2_tfidf_vvec)
    tt2.toc()
    # Save to cache
    print('[mc2] saving to cache')
    io.smart_save(cx2_vvec, cache_dir, 'cx2_vvec', uid, '.cPkl')  # sparse
    io.smart_save(wx2_cxs, cache_dir, 'wx2_cxs', uid, '.npy')
    io.smart_save(wx2_fxs, cache_dir, 'wx2_fxs', uid, '.npy')
    io.smart_save(wx2_idf, cache_dir, 'wx2_idf', uid, '.npy')
    return cx2_vvec, wx2_cxs, wx2_fxs, wx2_idf