def test_searcher():
    """Run the shared searcher battery on two differently populated indexes.

    The same model and query are used twice: once with an index filled from
    raw data via ``add_data`` and once with an index filled from precomputed
    codes via ``add_codes``.
    """
    # Pickle payloads are binary; open in 'rb' and release the handle as soon
    # as the fixture is loaded instead of leaking it until garbage collection.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as data_file:
        data = pkl.load(data_file)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
def init_lopq(self):
    """ Initialize LOPQ model and searcher from `global_conf` value.

    Reads the model type from ``self.global_conf['SE_lopq']`` and the pickled
    model location from ``self.global_conf['SE_lopqmodel']``. A location that
    starts with ``START_HDFS`` is first copied to a local temporary file. On
    success the searcher is stored in ``self.searcher_lopq``.

    Raises:
      ValueError: if 'SE_lopq' is missing from the configuration or holds a
        value other than "lopq_pca".
    """
    field = 'SE_lopq'
    if field not in self.global_conf:
        raise ValueError("[Searcher: error] " + field + " is not defined in configuration file.")
    if self.global_conf[field] != "lopq_pca":
        raise ValueError(
            "[SearcherLOPQHBase: error] unkown 'lopq' type {}.".format(
                self.global_conf[field]))

    # NOTE(review): LOPQModelPCA is not referenced directly below; presumably
    # it is imported so the class is available when unpickling -- confirm.
    from lopq.model import LOPQModelPCA
    from lopq.search import LOPQSearcher
    import pickle

    # actually load pickle from disk
    lopq_model_path = self.global_conf['SE_lopqmodel']
    if lopq_model_path.startswith(START_HDFS):
        # deal with HDFS path
        from lopq.utils import copy_from_hdfs
        import shutil
        filename = copy_from_hdfs(lopq_model_path)
        try:
            # `filename` is a path (its dirname is removed below), so it has to
            # be opened: pickle.load() needs a binary file object, not a string.
            with open(filename, "rb") as model_file:
                lopq_model = pickle.load(model_file)
        finally:
            # Best-effort removal of the temporary copy, also when loading fails.
            shutil.rmtree(os.path.dirname(filename), ignore_errors=True)
    else:
        # local path in config
        with open(lopq_model_path, "rb") as model_file:
            lopq_model = pickle.load(model_file)

    self.searcher_lopq = LOPQSearcher(lopq_model)
def test_oxford5k():
    """End-to-end check of LOPQ fitting and search on the Oxford data.

    Fits a full model, verifies the code of the first test point and the
    number of empty cells, then checks nearest-neighbour recall for the full
    model and for two models that reuse some of its fitted parameters.
    """
    seed = 40
    recall_floor = [0.51, 0.92, 0.97, 0.97]

    data = load_oxford_data()
    train, test = train_test_split(data, test_size=0.2, random_state=seed)

    # Distance-sorted neighbours in the training set for every test point
    nns = compute_all_neighbors(test, train)

    def check_recall(model):
        # Index the training set with `model` and require the recall to
        # exceed the floor at every position.
        index = LOPQSearcher(model)
        index.add_data(train)
        recall, _ = get_recall(index, test, nns)
        assert_true(np.all(recall > recall_floor))

    # Full fit
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=seed)

    # The code computed for the first test point must match the reference
    expected_code = ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250))
    assert_equal(m.predict(test[0]), expected_code)

    # Only a small number of cells may stay empty
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # True NN recall on the test set
    check_recall(m)

    # Partial fit: coarse quantizers are supplied
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=seed)
    check_recall(m2)

    # Partial fit: coarse quantizers and rotations are supplied
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=seed)
    check_recall(m3)
def test_searcher():
    """Check the size of quota-based results for a fixed query.

    Indexes the pickled fixture data with the stored test model and verifies,
    for the default quota and for ``quota=20``, both the number of retrieved
    items and the number of visited cells.
    """
    # Pickle payloads are binary; open in 'rb' and release the handle as soon
    # as the fixture is loaded instead of leaking it until garbage collection.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as data_file:
        data = pkl.load(data_file)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    # Default quota
    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    # Explicit, larger quota
    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
def init_searcher(self):
    """ Initialize the LOPQ model and the searcher built on top of it.

    The model is obtained from the first source that works:
      1. ``self.storer``, under the key ``self.build_model_str()``;
      2. the public s3 bucket, if ``self.get_pretrained_model`` is set
         (the downloaded model is then saved to the storer, unless the storer
         is an ``S3Storer`` on that same bucket);
      3. training with ``self.train_index()``, after opening the LMDB features
         environment ``self.save_feat_env``; the result is saved to the storer.

    If the resulting model is truthy, ``self.searcher`` is created according
    to ``self.lopq_searcher``. For "LOPQSearcherLMDB" the LMDB updates
    environment (``self.updates_env`` / ``self.updates_index_db``) is opened
    as well.

    Raises:
      ValueError: if a model is available but ``self.lopq_searcher`` is
        neither "LOPQSearcherLMDB" nor "LOPQSearcher".
    """
    try:
        # Try to load pretrained model from storer
        lopq_model = self.storer.load(self.build_model_str())
        if lopq_model is None:
            raise ValueError("Could not load model from storer.")
    except Exception as inst:
        # A plain ValueError (such as the one raised above) is the expected
        # "no model yet" case. Anything else, including ValueError subclasses,
        # gets a full trace, hence the exact type comparison rather than
        # isinstance().
        if type(inst) is not ValueError:
            full_trace_error(inst)
        print("[{}: log] Looks like model was not trained yet ({})".format(
            self.pp, inst))

        self.loaded_pretrain_model = False
        # Try to get it from public bucket e.g.:
        # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
        if self.get_pretrained_model:
            log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
            print(log_msg.format(self.pp, self.build_model_str()))
            from ..common.dl import download_file
            import pickle
            try:
                base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                # This can fail with a "retrieval incomplete: got only" ...
                download_file(base_model_path + self.build_model_str(),
                              self.build_model_str())
                # NOTE(security): the downloaded file is unpickled as-is, and
                # unpickling can execute arbitrary code; only safe while the
                # bucket content is trusted.
                # Use a context manager so the file handle is closed even if
                # unpickling fails.
                with open(self.build_model_str(), 'rb') as model_file:
                    lopq_model = pickle.load(model_file)

                # Avoid overwritting the model in s3 with s3storer using
                # dig-cu-imagesearchindex bucket
                is_s3_storer = isinstance(self.storer, S3Storer)
                if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                    log_msg = "[{}: log] Skipping saving model {} back to s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                else:
                    log_msg = "[{}: log] Saving model {} to storer"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.storer.save(self.build_model_str(), lopq_model)

                log_msg = "[{}: log] Loaded pretrained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                # Set last: any failure above leaves the flag False, so the
                # training fallback below still runs.
                self.loaded_pretrain_model = True
            except Exception as download_err:
                log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                full_trace_error(
                    log_msg.format(self.pp, self.build_model_str(), download_err))
                sys.stdout.flush()
        else:
            log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
            print(log_msg.format(self.pp, self.build_model_str()))

        if not self.loaded_pretrain_model:
            # This is from our modified LOPQ package...
            # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
            # 'LOPQModelPCA' could be the type of the model loaded from pickle file

            # Size of DB should depend on nb_train... How should we properly set the size of this?
            # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
            feat_size = get_feat_size(self.featurizer_type)
            if self.model_type == "lopq_pca":
                map_size = self.nb_train_pca * feat_size * 4 * 8
                map_size += self.nb_train * self.model_params['pca'] * 4 * 8
            else:
                map_size = self.nb_train * feat_size * 4 * 8
            # The estimate is enlarged by 10% before opening the environment.
            self.save_feat_env = lmdb.open('/data/lmdb_feats_' + self.build_model_str(),
                                           map_size=int(1.1 * map_size),
                                           writemap=True, map_async=True, max_dbs=2)

            # Train and save model in save_path folder
            lopq_model = self.train_index()
            # TODO: we could build a more unique model identifier
            # (using domain information? sha1/md5 of model parameters? using date of training?)
            # that would also mean we should list from the storer and guess
            # (based on date of creation) the correct model above...
            self.storer.save(self.build_model_str(), lopq_model)

    # Setup searcher with LOPQ model
    if lopq_model:
        # LOPQSearcherLMDB is now the default, as it makes the index more persistent
        # and potentially more easily usable with multiple processes.
        if self.lopq_searcher == "LOPQSearcherLMDB":
            from lopq.search import LOPQSearcherLMDB
            # TODO: should we get path from a parameter? and/or add model_str to it?
            self.searcher = LOPQSearcherLMDB(
                lopq_model,
                lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                id_lambda=str)
            # How could we properly set the size of this?
            self.updates_env = lmdb.open('/data/lmdb_updates_' + self.build_model_str(),
                                         map_size=1024 * 1000000 * 1,
                                         writemap=True, map_async=True, max_dbs=1)
            self.updates_index_db = self.updates_env.open_db("updates")
        elif self.lopq_searcher == "LOPQSearcher":
            from lopq.search import LOPQSearcher
            self.searcher = LOPQSearcher(lopq_model)
        else:
            raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                self.lopq_searcher))