Example #1
# `relpath` and `searcher_instance_battery` are helpers defined alongside
# this test in tests.py.
import pickle as pkl
import numpy as np

from lopq.model import LOPQModel
from lopq.search import LOPQSearcher


def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl'), 'rb'))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
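
Both paths should index the dataset identically: `add_data` computes LOPQ codes internally, while `add_codes` takes codes computed up front with `m.predict`. The shared assertions live in the `searcher_instance_battery` helper defined in tests.py; a hypothetical minimal version (illustrative only, not the real helper) might be:

def searcher_instance_battery(searcher, q):
    # Hypothetical stand-in for the tests.py helper: run a query and
    # sanity-check that results come back and some cells were visited.
    retrieved, visited = searcher.get_result_quota(q, quota=10)
    assert len(retrieved) >= 10
    assert visited >= 1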
Example #2
def init_lopq(self):
    """ Initialize LOPQ model and searcher from `global_conf` value.
    """
    field = 'SE_lopq'
    if field not in self.global_conf:
        raise ValueError("[Searcher: error] " + field +
                         " is not defined in configuration file.")
    elif self.global_conf[field] == "lopq_pca":
        from lopq.model import LOPQModelPCA
        from lopq.search import LOPQSearcher
        import pickle
        # actually load pickle from disk
        lopq_model_path = self.global_conf['SE_lopqmodel']
        if lopq_model_path.startswith(START_HDFS):
            # deal with HDFS path: copy the model to a local temporary file first
            from lopq.utils import copy_from_hdfs
            import os
            import shutil
            filename = copy_from_hdfs(lopq_model_path)
            # `copy_from_hdfs` returns a local path, so open the file before unpickling
            with open(filename, "rb") as f:
                lopq_model = pickle.load(f)
            try:
                # clean up the temporary local copy
                shutil.rmtree(os.path.dirname(filename))
            except Exception:
                pass
        else:
            # local path in config
            lopq_model = pickle.load(open(lopq_model_path, "rb"))
        self.searcher_lopq = LOPQSearcher(lopq_model)
    else:
        raise ValueError(
            "[SearcherLOPQHBase: error] unknown 'lopq' type {}.".format(
                self.global_conf[field]))
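
A minimal sketch of the configuration this method expects (the key names are taken from the code above; the values and the START_HDFS prefix are assumptions):

# Hypothetical configuration for init_lopq; only the two keys the method
# reads are shown. START_HDFS is presumably an HDFS URI prefix such as
# "hdfs://", used to decide whether the model must be copied locally first.
global_conf = {
    "SE_lopq": "lopq_pca",                      # selects the lopq_pca branch
    "SE_lopqmodel": "/path/to/lopq_model.pkl",  # local path or HDFS URI
}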
Example #3
# `load_oxford_data`, `compute_all_neighbors`, `get_cell_histogram` and
# `get_recall` are helpers defined alongside this test in tests.py.
import numpy as np
from nose.tools import assert_equal, assert_true
# sklearn >= 0.18; older versions exposed this in sklearn.cross_validation
from sklearn.model_selection import train_test_split

from lopq.model import LOPQModel
from lopq.search import LOPQSearcher


def test_oxford5k():

    random_state = 40
    data = load_oxford_data()
    train, test = train_test_split(data,
                                   test_size=0.2,
                                   random_state=random_state)

    # Compute distance-sorted neighbors in training set for each point in test set
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=random_state)

    # Assert correct code computation
    assert_equal(m.predict(test[0]),
                 ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250)))

    # Assert low number of empty cells
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # Assert true NN recall on test set
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with just coarse quantizers
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with coarse quantizers and rotations
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))
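
The `parameters` tuple passed to `LOPQModel` is `(Cs, Rs, mus, subquantizers)`; any entry left as `None` is trained by `fit`, which is how the partial-fitting cases above warm-start from `m`'s coarse quantizers and rotations. Supplying all four components would skip training entirely, as in this sketch reusing the fitted model above:

# Reuse every learned component: with the full tuple supplied, no stage
# needs retraining and the model can predict codes immediately.
m4 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, m.subquantizers))
assert m4.predict(test[0]) == m.predict(test[0])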
Example #4
File: tests.py  Project: agangzz/lopq
# `relpath` is a helper defined alongside this test in tests.py.
import pickle as pkl
import numpy as np
from nose.tools import assert_equal

from lopq.model import LOPQModel
from lopq.search import LOPQSearcher


def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl'), 'rb'))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
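
`get_result_quota` keeps visiting multi-index cells until at least `quota` items (default 10) have been retrieved, which is why raising the quota to 20 both returns more candidates (28 vs. 12) and visits more cells (5 vs. 3). To get ranked results instead of raw cell contents, a sketch using the searcher's `search` method, which sorts what `get_result_quota` retrieves by approximate LOPQ distance (signature assumed from lopq's search module; verify locally):

# Rank the retrieved candidates and keep the 10 closest.
results, visited = searcher.search(q, quota=20, limit=10, with_dists=True)
for r in results:
    print(r.id, r.dist)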
Example #5
    def init_searcher(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
        """
        try:
            # Try to load pretrained model from storer
            lopq_model = self.storer.load(self.build_model_str())
            if lopq_model is None:
                raise ValueError("Could not load model from storer.")
            # if self.verbose > 1:
            #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
            #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
        except Exception as inst:
            if not isinstance(inst, ValueError):
                full_trace_error(inst)
            print("[{}: log] Looks like model was not trained yet ({})".format(
                self.pp, inst))

            self.loaded_pretrain_model = False
            # Try to get it from public bucket e.g.:
            # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
            if self.get_pretrained_model:
                log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                from ..common.dl import download_file
                import pickle
                try:
                    base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                    # This can fail with a "retrieval incomplete: got only" ...
                    download_file(base_model_path + self.build_model_str(),
                                  self.build_model_str())
                    lopq_model = pickle.load(open(self.build_model_str(),
                                                  'rb'))
                    # Avoid overwriting the model in s3 with s3storer using dig-cu-imagesearchindex bucket
                    is_s3_storer = isinstance(self.storer, S3Storer)
                    if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                        log_msg = "[{}: log] Skipping saving model {} back to s3"
                        print(log_msg.format(self.pp, self.build_model_str()))
                    else:
                        log_msg = "[{}: log] Saving model {} to storer"
                        print(log_msg.format(self.pp, self.build_model_str()))
                        self.storer.save(self.build_model_str(), lopq_model)
                    log_msg = "[{}: log] Loaded pretrained model {} from s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.loaded_pretrain_model = True
                except Exception as inst:
                    log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                    #print(log_msg.format(self.pp, self.build_model_str(), inst))
                    full_trace_error(
                        log_msg.format(self.pp, self.build_model_str(), inst))
                    sys.stdout.flush()
            else:
                log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
                print(log_msg.format(self.pp, self.build_model_str()))

            if not self.loaded_pretrain_model:
                # This is from our modified LOPQ package...
                # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
                # 'LOPQModelPCA' could be the type of the model loaded from pickle file
                # from lopq.model import LOPQModel, LOPQModelPCA
                # Size of DB should depend on nb_train... How should we properly set the size of this?
                # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
                feat_size = get_feat_size(self.featurizer_type)
                if self.model_type == "lopq_pca":
                    map_size = self.nb_train_pca * feat_size * 4 * 8
                    map_size += self.nb_train * self.model_params['pca'] * 4 * 8
                else:
                    map_size = self.nb_train * feat_size * 4 * 8
                self.save_feat_env = lmdb.open('/data/lmdb_feats_' +
                                               self.build_model_str(),
                                               map_size=int(1.1 * map_size),
                                               writemap=True,
                                               map_async=True,
                                               max_dbs=2)

                # Train and save model in save_path folder
                lopq_model = self.train_index()
                # TODO: we could build a more unique model identifier
                # (using domain information? sha1/md5 of model parameters? using date of training?)
                # that would also mean we should list from the storer and guess
                # (based on date of creation) the correct model above...
                self.storer.save(self.build_model_str(), lopq_model)

        # Setup searcher with LOPQ model
        if lopq_model:
            # LOPQSearcherLMDB is now the default, as it makes the index more persistent
            # and potentially more easily usable with multiple processes.
            if self.lopq_searcher == "LOPQSearcherLMDB":
                from lopq.search import LOPQSearcherLMDB
                # TODO: should we get path from a parameter? and/or add model_str to it?
                # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
                # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
                self.searcher = LOPQSearcherLMDB(
                    lopq_model,
                    lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                    id_lambda=str)
                # How could we properly set the size of this?
                self.updates_env = lmdb.open('/data/lmdb_updates_' +
                                             self.build_model_str(),
                                             map_size=1024 * 1000000 * 1,
                                             writemap=True,
                                             map_async=True,
                                             max_dbs=1)
                self.updates_index_db = self.updates_env.open_db(b"updates")  # lmdb db names must be bytes on Python 3
            elif self.lopq_searcher == "LOPQSearcher":
                from lopq.search import LOPQSearcher
                self.searcher = LOPQSearcher(lopq_model)
            else:
                raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                    self.lopq_searcher))
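
The two backends differ mainly in persistence: `LOPQSearcherLMDB` stores codes in an LMDB environment on disk so the index survives restarts and can more easily be shared across processes, while `LOPQSearcher` keeps the index in memory. A minimal construction sketch (the lmdb_path here is hypothetical; the signatures mirror the calls above):

from lopq.search import LOPQSearcher, LOPQSearcherLMDB

in_memory = LOPQSearcher(lopq_model)                  # index held in RAM
persistent = LOPQSearcherLMDB(lopq_model,
                              lmdb_path='/data/lmdb_index_demo',  # hypothetical path
                              id_lambda=str)          # ids converted to str for LMDB keys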