def init_lopq(self):
    """ Initialize LOPQ model and searcher from `global_conf` value.
    """
    field = 'SE_lopq'
    if field not in self.global_conf:
        raise ValueError("[Searcher: error] " + field +
                         " is not defined in configuration file.")
    elif self.global_conf[field] == "lopq_pca":
        from lopq.model import LOPQModelPCA
        from lopq.search import LOPQSearcher
        import pickle
        # actually load pickle from disk
        lopq_model_path = self.global_conf['SE_lopqmodel']
        if lopq_model_path.startswith(START_HDFS):
            # deal with HDFS path: copy locally first, then unpickle
            from lopq.utils import copy_from_hdfs
            import shutil
            filename = copy_from_hdfs(lopq_model_path)
            lopq_model = pickle.load(open(filename, "rb"))
            try:
                # best-effort cleanup of the temporary local copy
                shutil.rmtree(os.path.dirname(filename))
            except Exception:
                pass
        else:
            # local path in config
            lopq_model = pickle.load(open(lopq_model_path, "rb"))
        self.searcher_lopq = LOPQSearcher(lopq_model)
    else:
        raise ValueError(
            "[SearcherLOPQHBase: error] unknown 'lopq' type {}.".format(
                self.global_conf[field]))
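A minimal sketch of the configuration this snippet expects. The `SE_lopq` and `SE_lopqmodel` keys come from the code above; the concrete values are hypothetical and assume a pickled LOPQ PCA model on local disk (an HDFS path would trigger the `copy_from_hdfs` branch instead).

global_conf = {
    # searcher type; must be "lopq_pca" to take the branch above
    "SE_lopq": "lopq_pca",
    # path to the pickled model (hypothetical local path)
    "SE_lopqmodel": "/data/models/lopq_pca_model.pkl",
}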
Example #2
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl'), 'rb'))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
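For context, a minimal end-to-end sketch of the API this test exercises, using random vectors instead of the pickled fixtures; the model sizes and data shapes are illustrative only.

import numpy as np
from lopq.model import LOPQModel
from lopq.search import LOPQSearcher

# small random dataset of 8-dimensional features (illustrative)
data = np.random.rand(1000, 8)

# train a small model with V coarse clusters and M subquantizers
m = LOPQModel(V=8, M=4)
m.fit(data, verbose=False)

# index the data and run an approximate nearest-neighbor query
searcher = LOPQSearcher(m)
searcher.add_data(data)
results, visited = searcher.search(np.ones(8), quota=20, limit=10, with_dists=True)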
Example #3
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl'), 'rb'))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
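The assertions above show that get_result_quota keeps visiting multi-index cells until at least `quota` candidates have been gathered (whole cells are added, so the count can overshoot) and also reports how many cells were visited. A short sketch of the ranked search call used by the searcher classes further down this page, reusing `searcher` and `q` from the test above; the quota and limit values are arbitrary.

retrieved, visited = searcher.search(q, quota=20, limit=10, with_dists=True)
for res in retrieved:
    # each result carries an id and an (approximate) distance
    print(res.id, res.dist)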
    def init_searcher(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
    """
        try:
            # Try to load pretrained model from storer
            lopq_model = self.storer.load(self.build_model_str())
            if lopq_model is None:
                raise ValueError("Could not load model from storer.")
            # if self.verbose > 1:
            #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
            #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
        except Exception as inst:
            if type(inst) != ValueError:
                full_trace_error(inst)
            print("[{}: log] Looks like model was not trained yet ({})".format(
                self.pp, inst))

            self.loaded_pretrain_model = False
            # Try to get it from public bucket e.g.:
            # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
            if self.get_pretrained_model:
                log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                from ..common.dl import download_file
                import pickle
                try:
                    base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                    # This can fail with a "retrieval incomplete: got only" ...
                    download_file(base_model_path + self.build_model_str(),
                                  self.build_model_str())
                    lopq_model = pickle.load(open(self.build_model_str(),
                                                  'rb'))
                    # Avoid overwriting the model in s3 when the s3storer uses the dig-cu-imagesearchindex bucket
                    is_s3_storer = isinstance(self.storer, S3Storer)
                    if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                        log_msg = "[{}: log] Skipping saving model {} back to s3"
                        print(log_msg.format(self.pp, self.build_model_str()))
                    else:
                        log_msg = "[{}: log] Saving model {} to storer"
                        print(log_msg.format(self.pp, self.build_model_str()))
                        self.storer.save(self.build_model_str(), lopq_model)
                    log_msg = "[{}: log] Loaded pretrained model {} from s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.loaded_pretrain_model = True
                except Exception as inst:
                    log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                    #print(log_msg.format(self.pp, self.build_model_str(), inst))
                    full_trace_error(
                        log_msg.format(self.pp, self.build_model_str(), inst))
                    sys.stdout.flush()
            else:
                log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
                print(log_msg.format(self.pp, self.build_model_str()))

            if not self.loaded_pretrain_model:
                # This is from our modified LOPQ package...
                # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
                # 'LOPQModelPCA' could be the type of the model loaded from pickle file
                # from lopq.model import LOPQModel, LOPQModelPCA
                # Size of DB should depend on nb_train... How should we properly set the size of this?
                # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
                feat_size = get_feat_size(self.featurizer_type)
                if self.model_type == "lopq_pca":
                    map_size = self.nb_train_pca * feat_size * 4 * 8
                    map_size += self.nb_train * self.model_params['pca'] * 4 * 8
                else:
                    map_size = self.nb_train * feat_size * 4 * 8
                self.save_feat_env = lmdb.open('/data/lmdb_feats_' +
                                               self.build_model_str(),
                                               map_size=int(1.1 * map_size),
                                               writemap=True,
                                               map_async=True,
                                               max_dbs=2)

                # Train and save model in save_path folder
                lopq_model = self.train_index()
                # TODO: we could build a more unique model identifier
                # (using domain information? sha1/md5 of model parameters? using date of training?)
                # that would also mean we should list from the storer and guess
                # (based on date of creation) the correct model above...
                self.storer.save(self.build_model_str(), lopq_model)

        # Setup searcher with LOPQ model
        if lopq_model:
            # LOPQSearcherLMDB is now the default, as it makes the index more persistent
            # and potentially more easily usable with multiple processes.
            if self.lopq_searcher == "LOPQSearcherLMDB":
                from lopq.search import LOPQSearcherLMDB
                # TODO: should we get path from a parameter? and/or add model_str to it?
                # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
                # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
                self.searcher = LOPQSearcherLMDB(
                    lopq_model,
                    lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                    id_lambda=str)
                # How could we properly set the size of this?
                self.updates_env = lmdb.open('/data/lmdb_updates_' +
                                             self.build_model_str(),
                                             map_size=1024 * 1000000 * 1,
                                             writemap=True,
                                             map_async=True,
                                             max_dbs=1)
                self.updates_index_db = self.updates_env.open_db("updates")
            elif self.lopq_searcher == "LOPQSearcher":
                from lopq.search import LOPQSearcher
                self.searcher = LOPQSearcher(lopq_model)
            else:
                raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                    self.lopq_searcher))
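The LMDB map_size computed in the snippet above budgets 4 * 8 = 32 bytes per stored feature dimension (8-byte floats plus generous headroom). A worked sketch of the arithmetic, assuming a hypothetical 2048-dimensional raw feature, the 256 PCA dimensions and 100000 PCA training samples that appear elsewhere on this page, and a hypothetical nb_train of 200000:

feat_size = 2048        # hypothetical raw feature dimensionality
pca_dims = 256          # PCA dimensions, as in the model string used on this page
nb_train_pca = 100000   # default nb_train_pca from the constructor below
nb_train = 200000       # hypothetical number of LOPQ training samples

# lopq_pca case: raw features for PCA training + PCA-reduced features for LOPQ training
map_size = nb_train_pca * feat_size * 4 * 8   # ~6.6 GB
map_size += nb_train * pca_dims * 4 * 8       # ~1.6 GB
map_size = int(1.1 * map_size)                # 10% margin, ~9 GB total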
class SearcherLOPQHBase(GenericSearcher):
    def __init__(self, global_conf_in, prefix=default_prefix):
        # number of processors to use for parallel computation of codes
        self.num_procs = 8  # could be read from configuration
        self.model_params = None
        self.get_pretrained_model = True
        self.nb_train_pca = 100000
        self.last_refresh = datetime.now()
        self.last_full_refresh = datetime.now()
        self.last_indexed_update = None
        self.pca_model_str = None
        # making LOPQSearcherLMDB the default LOPQSearcher
        self.lopq_searcher = "LOPQSearcherLMDB"
        super(SearcherLOPQHBase, self).__init__(global_conf_in, prefix)

    def get_model_params(self):
        V = self.get_required_param('lopq_V')
        M = self.get_required_param('lopq_M')
        subq = self.get_required_param('lopq_subq')
        # we could use that for a more fine grained model naming...
        self.model_params = {'V': V, 'M': M, 'subq': subq}
        if self.model_type == "lopq_pca":
            # Number of dimensions to keep after PCA
            pca = self.get_required_param('lopq_pcadims')
            self.model_params['pca'] = pca
            nb_train_pca = self.get_required_param('nb_train_pca')
            self.nb_train_pca = nb_train_pca
            nb_min_train_pca = self.get_param('nb_min_train_pca')
            if nb_min_train_pca:
                self.nb_min_train_pca = nb_min_train_pca
        lopq_searcher = self.get_param('lopq_searcher')
        if lopq_searcher:
            self.lopq_searcher = lopq_searcher

    def build_pca_model_str(self):
        # Use feature type, self.nb_train_pca and pca_dims
        if self.pca_model_str is None:
            # We could add some additional info: model parameters, number of samples used for training...
            self.pca_model_str = self.build_extr_str() + "_pca" + str(
                self.model_params['pca'])
            self.pca_model_str += "_train" + str(self.nb_train_pca)
        return self.pca_model_str
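    # Illustration (hypothetical): with the pre-trained model string shown later on
    # this page, "sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000",
    # build_extr_str() would be something like "sbpycaffe_feat_full_image", so the PCA
    # model string would resemble "sbpycaffe_feat_full_image_pca256_train100000".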

    def set_pp(self):
        self.pp = "SearcherLOPQHBase"

    def init_searcher(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
    """
        try:
            # Try to load pretrained model from storer
            lopq_model = self.storer.load(self.build_model_str())
            if lopq_model is None:
                raise ValueError("Could not load model from storer.")
            # if self.verbose > 1:
            #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
            #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
        except Exception as inst:
            if type(inst) != ValueError:
                full_trace_error(inst)
            print("[{}: log] Looks like model was not trained yet ({})".format(
                self.pp, inst))

            self.loaded_pretrain_model = False
            # Try to get it from public bucket e.g.:
            # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
            if self.get_pretrained_model:
                log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                from ..common.dl import download_file
                import pickle
                try:
                    base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                    # This can fail with a "retrieval incomplete: got only" ...
                    download_file(base_model_path + self.build_model_str(),
                                  self.build_model_str())
                    lopq_model = pickle.load(open(self.build_model_str(),
                                                  'rb'))
                    # Avoid overwriting the model in s3 when the s3storer uses the dig-cu-imagesearchindex bucket
                    is_s3_storer = isinstance(self.storer, S3Storer)
                    if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                        log_msg = "[{}: log] Skipping saving model {} back to s3"
                        print(log_msg.format(self.pp, self.build_model_str()))
                    else:
                        log_msg = "[{}: log] Saving model {} to storer"
                        print(log_msg.format(self.pp, self.build_model_str()))
                        self.storer.save(self.build_model_str(), lopq_model)
                    log_msg = "[{}: log] Loaded pretrained model {} from s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.loaded_pretrain_model = True
                except Exception as inst:
                    log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                    #print(log_msg.format(self.pp, self.build_model_str(), inst))
                    full_trace_error(
                        log_msg.format(self.pp, self.build_model_str(), inst))
                    sys.stdout.flush()
            else:
                log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
                print(log_msg.format(self.pp, self.build_model_str()))

            if not self.loaded_pretrain_model:
                # This is from our modified LOPQ package...
                # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
                # 'LOPQModelPCA' could be the type of the model loaded from pickle file
                # from lopq.model import LOPQModel, LOPQModelPCA
                # Size of DB should depend on nb_train... How should we properly set the size of this?
                # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
                feat_size = get_feat_size(self.featurizer_type)
                if self.model_type == "lopq_pca":
                    map_size = self.nb_train_pca * feat_size * 4 * 8
                    map_size += self.nb_train * self.model_params['pca'] * 4 * 8
                else:
                    map_size = self.nb_train * feat_size * 4 * 8
                self.save_feat_env = lmdb.open('/data/lmdb_feats_' +
                                               self.build_model_str(),
                                               map_size=int(1.1 * map_size),
                                               writemap=True,
                                               map_async=True,
                                               max_dbs=2)

                # Train and save model in save_path folder
                lopq_model = self.train_index()
                # TODO: we could build a more unique model identifier
                # (using domain information? sha1/md5 of model parameters? using date of training?)
                # that would also mean we should list from the storer and guess
                # (based on date of creation) the correct model above...
                self.storer.save(self.build_model_str(), lopq_model)

        # Setup searcher with LOPQ model
        if lopq_model:
            # LOPQSearcherLMDB is now the default, as it makes the index more persistent
            # and potentially more easily usable with multiple processes.
            if self.lopq_searcher == "LOPQSearcherLMDB":
                from lopq.search import LOPQSearcherLMDB
                # TODO: should we get path from a parameter? and/or add model_str to it?
                # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
                # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
                self.searcher = LOPQSearcherLMDB(
                    lopq_model,
                    lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                    id_lambda=str)
                # How could we properly set the size of this?
                self.updates_env = lmdb.open('/data/lmdb_updates_' +
                                             self.build_model_str(),
                                             map_size=1024 * 1000000 * 1,
                                             writemap=True,
                                             map_async=True,
                                             max_dbs=1)
                self.updates_index_db = self.updates_env.open_db("updates")
            elif self.lopq_searcher == "LOPQSearcher":
                from lopq.search import LOPQSearcher
                self.searcher = LOPQSearcher(lopq_model)
            else:
                raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                    self.lopq_searcher))
        # NB: an empty lopq_model would make sense only if we just want to detect...

    def get_feats_from_lmbd(self, feats_db, nb_features, dtype):
        nb_saved_feats = self.get_nb_saved_feats(feats_db)
        nb_feats_to_read = min(nb_saved_feats, nb_features)
        feats = None
        if nb_feats_to_read > 0:
            with self.save_feat_env.begin(db=feats_db, write=False) as txn:
                with txn.cursor() as cursor:
                    if cursor.first():
                        first_item = cursor.item()
                        first_feat = np.frombuffer(first_item[1], dtype=dtype)
                        feats = np.zeros(
                            (nb_feats_to_read, first_feat.shape[0]))
                        print(
                            "[get_feats_from_lmbd] Filling up features matrix: {}"
                            .format(feats.shape))
                        sys.stdout.flush()
                        for i, item in enumerate(cursor.iternext()):
                            if i >= nb_feats_to_read:
                                break
                            feats[i, :] = np.frombuffer(item[1], dtype=dtype)
        return feats

    def save_feats_to_lmbd(self,
                           feats_db,
                           samples_ids,
                           np_features,
                           max_feats=0):
        # 'entries' count after the last put; stays 0 if no samples were written
        nb_feats = 0
        with self.save_feat_env.begin(db=feats_db, write=True) as txn:
            for i, sid in enumerate(samples_ids):
                txn.put(bytes(sid), np_features[i, :].tobytes())
                nb_feats = txn.stat()['entries']
                if max_feats and nb_feats >= max_feats:
                    return nb_feats
        return nb_feats

    def get_nb_saved_feats(self, feats_db):
        with self.save_feat_env.begin(db=feats_db, write=False) as txn:
            return txn.stat()['entries']

    def get_train_features(self,
                           nb_features,
                           lopq_pca_model=None,
                           nb_min_train=None):
        if nb_min_train is None:
            nb_min_train = nb_features
        if lopq_pca_model:
            feats_db = self.save_feat_env.open_db("feats_pca")
            dtype = np.float32
        else:
            feats_db = self.save_feat_env.open_db("feats")
            from ..featurizer.featsio import get_feat_dtype
            dtype = get_feat_dtype(self.featurizer_type)
        nb_saved_feats = self.get_nb_saved_feats(feats_db)
        nb_features_to_read = nb_features

        seen_updates = set()

        if nb_saved_feats < nb_features:
            print("[{}: log] Gathering {} training samples...".format(
                self.pp, nb_features))
            sys.stdout.flush()
            start_date = "1970-01-01"
            done = False
            # Accumulate until we have enough features, or we have read all features if 'wait_for_nbtrain'
            # is false and we have at least nb_min_train
            while not done:
                for batch_updates in self.indexer.get_updates_from_date(
                        start_date=start_date,
                        extr_type=self.build_extr_str()):
                    for update in batch_updates:
                        # for update in updates:
                        try:
                            # We could check if update has been processed, but if not we won't get features anyway
                            update_id = update[0]
                            if column_list_sha1s in update[1]:
                                if update_id not in seen_updates:
                                    list_sha1s = update[1][column_list_sha1s]
                                    samples_ids, features = self.indexer.get_features_from_sha1s(
                                        list_sha1s.split(','),
                                        self.build_extr_str())
                                    if features:
                                        # Apply PCA to features here to save memory
                                        if lopq_pca_model:
                                            np_features = lopq_pca_model.apply_PCA(
                                                np.asarray(features))
                                        else:
                                            np_features = np.asarray(features)
                                        log_msg = "[{}: log] Got features {} from update {}"
                                        print(
                                            log_msg.format(
                                                self.pp, np_features.shape,
                                                update_id))
                                        sys.stdout.flush()
                                        # just appending like this does not account for duplicates...
                                        # train_features.extend(np_features)
                                        nb_saved_feats = self.save_feats_to_lmbd(
                                            feats_db, samples_ids, np_features)
                                        seen_updates.add(update_id)
                                    else:
                                        if self.verbose > 3:
                                            log_msg = "[{}: log] Did not get features from update {}"
                                            print(
                                                log_msg.format(
                                                    self.pp, update_id))
                                            sys.stdout.flush()
                                    if nb_saved_feats >= nb_features:
                                        done = True
                                        break
                            else:
                                warn_msg = "[{}: warning] Update {} has no list of images associated to it."
                                print(warn_msg.format(self.pp, update_id))
                                sys.stdout.flush()
                        except Exception as inst:
                            from cufacesearch.common.error import full_trace_error
                            err_msg = "[{}: error] Failed to get features: {} {}"
                            full_trace_error(
                                err_msg.format(self.pp, type(inst), inst))
                            sys.stdout.flush()
                        else:
                            if self.verbose > 4:
                                print(
                                    "[{}: log] Got {} training samples so far..."
                                    .format(self.pp, nb_saved_feats))
                                sys.stdout.flush()
                        if done:
                            nb_features_to_read = nb_saved_feats
                            break
                else:
                    if not done:
                        # Wait for new updates...
                        # TODO: could be optional
                        if self.wait_for_nbtrain:
                            if nb_saved_feats >= nb_min_train:
                                log_msg = "[{}: log] Gathered minimum number of training features ({})..."
                                print(log_msg.format(self.pp, nb_min_train))
                                sys.stdout.flush()
                                break
                            else:
                                log_msg = "[{}: log] Waiting for new updates. Got {} training samples so far..."
                                print(log_msg.format(self.pp, nb_saved_feats))
                                sys.stdout.flush()
                                time.sleep(60)
                        else:
                            log_msg = "[{}: log] Gathered all available features ({})..."
                            print(
                                log_msg.format(
                                    self.pp,
                                    self.get_nb_saved_feats(feats_db)))
                            sys.stdout.flush()
                            break

        return self.get_feats_from_lmbd(feats_db, nb_features_to_read, dtype)

    def train_index(self):

        if self.model_type == "lopq":
            train_np = self.get_train_features(self.nb_train,
                                               nb_min_train=self.nb_min_train)
            print("Got train features array with shape: {}".format(
                train_np.shape))
            nb_train_feats = train_np.shape[0]
            sys.stdout.flush()

            if nb_train_feats >= self.nb_train:
                from lopq.model import LOPQModel
                # we could have default values for those parameters and/or heuristic to estimate them based on data count...
                lopq_model = LOPQModel(
                    V=self.model_params['V'],
                    M=self.model_params['M'],
                    subquantizer_clusters=self.model_params['subq'])
                # we could have separate training/indexing features
                msg = "[{}.train_model: info] Starting local training of 'lopq' model with parameters {} using {} features."
                print(msg.format(self.pp, self.model_params, nb_train_feats))
                start_train = time.time()
                # specify a n_init < 10 (default value) to speed-up training?
                lopq_model.fit(train_np, verbose=True)
                # save model
                self.storer.save(self.build_model_str(), lopq_model)
                print(
                    "[{}.train_model: info] Trained lopq model in {}s.".format(
                        self.pp,
                        time.time() - start_train))
                return lopq_model
            else:
                msg = "[{}.train_model: error] Could not train model, not enough training samples."
                print(msg.format(self.pp))

        elif self.model_type == "lopq_pca":
            # lopq_pca training.
            from lopq.model import LOPQModelPCA
            # we could have default values for those parameters
            # and/or heuristic to estimate them based on data count...
            lopq_model = LOPQModelPCA(
                V=self.model_params['V'],
                M=self.model_params['M'],
                subquantizer_clusters=self.model_params['subq'],
                renorm=True)
            # pca loading/training first
            pca_model = self.storer.load(self.build_pca_model_str())
            if pca_model is None:
                train_np = self.get_train_features(
                    self.nb_train_pca, nb_min_train=self.nb_min_train_pca)
                msg = "[{}.train_model: info] Training PCA model, keeping {} dimensions from features {}."
                print(
                    msg.format(self.pp, self.model_params['pca'],
                               train_np.shape))
                sys.stdout.flush()
                start_train_pca = time.time()
                lopq_model.fit_pca(train_np, pca_dims=self.model_params['pca'])
                info_msg = "[{}.train_model: info] Trained pca model in {}s."
                print(info_msg.format(self.pp, time.time() - start_train_pca))
                del train_np
                self.storer.save(self.build_pca_model_str(), {
                    "P": lopq_model.pca_P,
                    "mu": lopq_model.pca_mu
                })
            else:
                lopq_model.pca_P = pca_model["P"]
                lopq_model.pca_mu = pca_model["mu"]
            # train model
            train_np = self.get_train_features(self.nb_train,
                                               lopq_pca_model=lopq_model,
                                               nb_min_train=self.nb_min_train)
            msg = "[{}.train_model: info] Training 'lopq_pca' model with parameters {} using features {}"
            print(msg.format(self.pp, self.model_params, train_np.shape))
            sys.stdout.flush()
            start_train = time.time()
            # specify a n_init < 10 (default value) to speed-up training?
            lopq_model.fit(train_np,
                           verbose=True,
                           apply_pca=False,
                           train_pca=False)
            # TODO: we could evaluate model based on reconstruction of some randomly sampled features?
            # save model
            self.storer.save(self.build_model_str(), lopq_model)
            info_msg = "[{}.train_model: info] Trained lopq model in {}s."
            print(info_msg.format(self.pp, time.time() - start_train))
            sys.stdout.flush()
            return lopq_model
            # err_msg = "[{}.train_model: error] Local training of 'lopq_pca' model not yet implemented."
            # raise NotImplementedError(err_msg.format(self.pp))
        else:
            err_msg = "[{}.train_model: error] Unknown 'lopq' type {}."
            raise ValueError(err_msg.format(self.pp, self.model_type))
        # print train_features_path, os.path.exists(train_features_path), lopq_params

    # TODO: should we try to evaluate index by pushing train_features to a temporary searcher
    #    - compute exhaustive search for some randomly selected samples
    #    - analyze retrieval performance of approximate search?
    # technically we could even explore different configurations...

    def compute_codes(self, det_ids, data, codes_path=None):
        # Compute codes for each update batch and save them
        from lopq.utils import compute_codes_parallel
        msg = "[{}.compute_codes: info] Computing codes for {} {}s."
        print(msg.format(self.pp, len(det_ids), self.input_type))

        # That keeps the ordering intact, but output is a chain
        codes = compute_codes_parallel(data, self.searcher.model,
                                       self.num_procs)

        # Build dict output
        codes_dict = dict()
        for i, code in enumerate(codes):
            codes_dict[det_ids[i]] = [code.coarse, code.fine]

        # Save
        if codes_path:
            self.storer.save(codes_path, codes_dict)

        return codes_dict
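    # Usage sketch (hypothetical inputs): assuming `feats` is a list of feature vectors
    # aligned with `det_ids`,
    #   codes_dict = self.compute_codes(det_ids, feats, self.build_codes_string(update_id))
    # computes {det_id: [coarse_code, fine_code]}, saves it through the storer, and returns it.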

    def add_update(self, update_id):
        if self.lopq_searcher == "LOPQSearcherLMDB":
            # Use another LMDB to store updates indexed?
            with self.updates_env.begin(db=self.updates_index_db,
                                        write=True) as txn:
                txn.put(bytes(update_id), bytes(datetime.now()))
        else:
            self.indexed_updates.add(update_id)
        self.last_indexed_update = update_id

    def is_update_indexed(self, update_id):
        if self.lopq_searcher == "LOPQSearcherLMDB":
            with self.updates_env.begin(db=self.updates_index_db,
                                        write=False) as txn:
                found_update = txn.get(bytes(update_id))
                if found_update:
                    return True
                else:
                    return False
        else:
            return update_id in self.indexed_updates

    def get_latest_update_suffix(self):
        if self.last_indexed_update is None:
            if self.lopq_searcher == "LOPQSearcherLMDB":
                # Try to get in from DB
                with self.updates_env.begin(db=self.updates_index_db,
                                            write=False) as txn:
                    with txn.cursor() as cursor:
                        if cursor.last():
                            item = cursor.item()
                            self.last_indexed_update = item[0]
                            suffix = '_'.join(
                                self.last_indexed_update.split('_')[6:])
                        else:  # Would happen on empty db?
                            suffix = "1970-01-01"
            else:
                suffix = "1970-01-01"
        else:
            suffix = '_'.join(self.last_indexed_update.split('_')[6:])
        return suffix
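    # Note: update ids look like
    # "index_update_dlib_feat_dlib_face_2017-12-18_83-ec25-1513640608.49" (see the
    # comment in load_codes below), so joining the pieces after the sixth '_' yields a
    # suffix such as "2017-12-18_83-ec25-1513640608.49", which load_codes then uses as
    # the start_date of its incremental scan.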

    def load_codes(self, full_refresh=False):
        # Calling this method can also perform an update of the index
        if not self.searcher:
            info_msg = "[{}.load_codes: info] Not loading codes as searcher is not initialized."
            print(info_msg.format(self.pp))
            return

        start_load = time.time()
        total_compute_time = 0

        try:
            # if self.searcher.nb_indexed == 0:
            #   # We should try to load a concatenation of all unique codes that also contains a list of the corresponding updates...
            #   # fill codes and self.indexed_updates
            #   self.load_all_codes()
            # TODO: try to get date of last update
            start_date = "1970-01-01"
            if not full_refresh:
                start_date = self.get_latest_update_suffix()

            # Get all updates ids for the extraction type
            # TODO: this scan makes the API unresponsive for ~2 minutes during the update process...
            for batch_updates in self.indexer.get_updates_from_date(
                    start_date=start_date, extr_type=self.build_extr_str()):
                for update in batch_updates:
                    # print "[{}: log] batch length: {}, update length: {}".format(self.pp, len(batch_updates),len(update))
                    update_id = update[0]
                    if self.is_update_indexed(update_id):
                        print("[{}: log] Skipping update {} already indexed.".
                              format(self.pp, update_id))
                        # What if the update was indexed with only partial extractions?
                        # TODO: If full_refresh we should check if indexing time is bigger than processing time...
                    else:
                        if "info:" + update_str_processed in update[1]:
                            print("[{}: log] Looking for codes of update {}".
                                  format(self.pp, update_id))
                            # Get this update codes
                            codes_string = self.build_codes_string(update_id)
                            try:
                                # Check for precomputed codes
                                codes_dict = self.storer.load(codes_string,
                                                              silent=True)
                                if codes_dict is None:
                                    raise ValueError(
                                        'Could not load codes: {}'.format(
                                            codes_string))
                                # TODO: If full_refresh, check that we have as many codes as available features?
                            except Exception as inst:
                                # Update codes not available
                                if self.verbose > 1:
                                    log_msg = "[{}: log] Update {} codes could not be loaded: {}"
                                    print(
                                        log_msg.format(self.pp, update_id,
                                                       inst))
                                # Compute codes for update not yet processed and save them
                                start_compute = time.time()
                                # Get detections (if any) and features...
                                if column_list_sha1s in update[1]:
                                    list_sha1s = update[1][column_list_sha1s]
                                    # Double check that this gets properly features of detections
                                    samples_ids, features = self.indexer.get_features_from_sha1s(
                                        list_sha1s.split(','),
                                        self.build_extr_str())
                                    # FIXME: Legacy dlib features seems to be float32...
                                    # Dirty fix for now. Should run workflow fix_feat_type in legacy branch
                                    if features:
                                        if features[0].shape[-1] < 128:
                                            samples_ids, features = self.indexer.get_features_from_sha1s(
                                                list_sha1s.split(','),
                                                self.build_extr_str(),
                                                "float32")
                                            if features:
                                                forced_msg = "Forced decoding of features as float32"
                                                forced_msg += ". Got {} samples, features with shape {}"
                                                print(
                                                    forced_msg.format(
                                                        len(samples_ids),
                                                        features[0].shape))
                                        codes_dict = self.compute_codes(
                                            samples_ids, features,
                                            codes_string)
                                        update_compute_time = time.time(
                                        ) - start_compute
                                        total_compute_time += update_compute_time
                                        if self.verbose > 0:
                                            log_msg = "[{}: log] Update {} codes computation done in {}s"
                                            print(
                                                log_msg.format(
                                                    self.pp, update_id,
                                                    update_compute_time))
                                    else:
                                        #index_update_dlib_feat_dlib_face_2017-12-18_83-ec25-1513640608.49
                                        print(
                                            "[{}: warning] Update {} has no features."
                                            .format(self.pp, update_id))
                                        continue
                                else:
                                    print(
                                        "[{}: warning] Update {} has no list of images."
                                        .format(self.pp, update_id))
                                    continue

                            # Use new method add_codes_from_dict of searcher
                            self.searcher.add_codes_from_dict(codes_dict)
                            # TODO: indexed_updates should be made persistent too, and add indexing time
                            self.add_update(update_id)

                        else:
                            print("[{}: log] Skipping unprocessed update {}".
                                  format(self.pp, update_id))
                    # TODO: we could check that update processing time was older than indexing time, otherwise that means that
                    #    the update has been reprocessed and should be re-indexed.

            total_load = time.time() - start_load
            self.last_refresh = datetime.now()

            print("[{}: log] Total udpates computation time is: {}s".format(
                self.pp, total_compute_time))
            print("[{}: log] Total udpates loading time is: {}s".format(
                self.pp, total_load))

        except Exception as inst:
            print("[{}: error] Could not load codes. {}".format(self.pp, inst))

    # def load_all_codes(self):
    #   # load self.indexed_updates, self.searcher.index and self.searcher.nb_indexed
    #   # NOT for LOPQSearcherLMDB
    #   pass
    #
    # def save_all_codes(self):
    #   # we should save self.indexed_updates, self.searcher.index and self.searcher.nb_indexed
    #   # self.searcher.index could be big, how to save without memory issue...
    #   # NOT for LOPQSearcherLMDB
    #   pass

    def search_from_feats(self, dets, feats, options_dict=dict()):
        import time
        start_search = time.time()
        all_sim_images = []
        all_sim_dets = []
        all_sim_score = []

        # check what is the near duplicate config
        filter_near_dup = False
        if (self.near_dup and "near_dup" not in options_dict) or (
                "near_dup" in options_dict and options_dict["near_dup"]):
            filter_near_dup = True
            if "near_dup_th" in options_dict:
                near_dup_th = options_dict["near_dup_th"]
            else:
                near_dup_th = self.near_dup_th

        max_returned = self.sim_limit
        if "max_returned" in options_dict:
            max_returned = options_dict["max_returned"]
        # this should be set with a parameter either in conf or options_dict too.
        # should we use self.quota here? and potentially overwrite from options_dict
        quota = min(1000 * max_returned, 10000)

        # print dets
        if self.detector is not None:
            # query for each feature
            for i in range(len(dets)):

                sim_images = []
                sim_dets = []
                sim_score = []

                for j in range(len(dets[i][1])):
                    results = []
                    if "detect_only" not in options_dict or not options_dict[
                            "detect_only"]:
                        if self.searcher:
                            # Normalize feature first as it is how it is done during extraction...
                            norm_feat = np.linalg.norm(feats[i][j])
                            normed_feat = np.squeeze(feats[i][j] / norm_feat)
                            results, visited = self.searcher.search(
                                normed_feat,
                                quota=quota,
                                limit=max_returned,
                                with_dists=True)
                            res_msg = "[{}.search_from_feats: log] got {} results by visiting {} cells, first one is: {}"
                            print(
                                res_msg.format(self.pp, len(results), visited,
                                               results[0]))

                    # If reranking, get features from hbase for detections using res.id
                    #   we could also already get 's3_url' to avoid a second call to HBase later...
                    if self.reranking:
                        try:
                            res_list_sha1s = [
                                str(x.id).split('_')[0] for x in results
                            ]
                            res_samples_ids, res_features = self.indexer.get_features_from_sha1s(
                                res_list_sha1s, self.build_extr_str())
                            # FIXME: dirty fix for dlib features size issue.
                            # To be removed once workflow applied on all legacy data
                            if res_features is not None and res_features[
                                    0].shape[-1] < 128:
                                res_samples_ids, res_features = self.indexer.get_features_from_sha1s(
                                    res_list_sha1s, self.build_extr_str(),
                                    "float32")
                                if res_features:
                                    forced_msg = "Forced decoding of features as float32. Got {} samples, features with shape {}"
                                    print(
                                        forced_msg.format(
                                            len(res_samples_ids),
                                            res_features[0].shape))
                        except Exception as inst:
                            err_msg = "[{}: error] Could not retrieve features for re-ranking. {}"
                            print(err_msg.format(self.pp, inst))

                    tmp_img_sim = []
                    tmp_dets_sim_ids = []
                    tmp_dets_sim_score = []
                    for ires, res in enumerate(results):
                        dist = res.dist
                        # if reranking compute actual distance
                        if self.reranking:
                            try:
                                pos = res_samples_ids.index(res.id)
                                dist = np.linalg.norm(normed_feat -
                                                      res_features[pos])
                                # print "[{}: res_features[{}] approx. dist: {}, rerank dist: {}".format(res.id, pos, res.dist, dist)
                            except Exception as inst:
                                # Means feature was not saved to backend index...
                                err_msg = "Could not compute reranking distance for sample {}, error {} {}"
                                print(err_msg.format(res.id, type(inst), inst))
                        if (filter_near_dup and
                                dist <= near_dup_th) or not filter_near_dup:
                            if not max_returned or (max_returned
                                                    and ires < max_returned):
                                tmp_dets_sim_ids.append(res.id)
                                # here id would be face_id that we could build as sha1_facebbox?
                                tmp_img_sim.append(str(res.id).split('_')[0])
                                tmp_dets_sim_score.append(dist)

                    # If reranking, we need to reorder
                    if self.reranking:
                        sids = np.argsort(tmp_dets_sim_score, axis=0)
                        rerank_img_sim = []
                        rerank_dets_sim_ids = []
                        rerank_dets_sim_score = []
                        for si in sids:
                            rerank_img_sim.append(tmp_img_sim[si])
                            rerank_dets_sim_ids.append(tmp_dets_sim_ids[si])
                            rerank_dets_sim_score.append(
                                tmp_dets_sim_score[si])
                        tmp_img_sim = rerank_img_sim
                        tmp_dets_sim_ids = rerank_dets_sim_ids
                        tmp_dets_sim_score = rerank_dets_sim_score

                    # print tmp_img_sim
                    if tmp_img_sim:
                        rows = []
                        try:
                            rows = self.indexer.get_columns_from_sha1_rows(
                                tmp_img_sim, self.needed_output_columns)
                        except Exception as inst:
                            err_msg = "[{}: error] Could not retrieve similar images info from indexer. {}"
                            print(err_msg.format(self.pp, inst))
                        # rows should contain id, s3_url of images
                        # print rows
                        if not rows:
                            sim_images.append([(x, ) for x in tmp_img_sim])
                        elif len(rows) < len(tmp_img_sim) or not rows:
                            # fall back to just sha1s... but beware to keep order...
                            dec = 0
                            fixed_rows = []
                            for pos, sha1 in enumerate(tmp_img_sim):
                                if rows[pos - dec][0] == sha1:
                                    fixed_rows.append(rows[pos - dec])
                                else:
                                    dec += 1
                                    fixed_rows.append((sha1, ))
                            sim_images.append(fixed_rows)
                        else:
                            sim_images.append(rows)
                        sim_dets.append(tmp_dets_sim_ids)
                        sim_score.append(tmp_dets_sim_score)
                    else:
                        sim_images.append([])
                        sim_dets.append([])
                        sim_score.append([])

                all_sim_images.append(sim_images)
                all_sim_dets.append(sim_dets)
                all_sim_score.append(sim_score)
        else:
            # No detection
            results = []
            sim_images = []
            sim_score = []

            for i in range(len(feats)):
                if self.searcher:
                    # Normalize feature first as it is how it is done during extraction...
                    norm_feat = np.linalg.norm(feats[i])
                    normed_feat = np.squeeze(feats[i] / norm_feat)
                    results, visited = self.searcher.search(normed_feat,
                                                            quota=quota,
                                                            limit=max_returned,
                                                            with_dists=True)
                    res_msg = "[{}.search_from_feats: log] got {} results by visiting {} cells, first one is: {}"
                    print(
                        res_msg.format(self.pp, len(results), visited,
                                       results[0]))

                # Reranking, get features from hbase for detections using res.id
                if self.reranking:
                    try:
                        res_list_sha1s = [str(x.id) for x in results]
                        res_samples_ids, res_features = self.indexer.get_features_from_sha1s(
                            res_list_sha1s, self.build_extr_str())
                    except Exception as inst:
                        err_msg = "[{}: error] Could not retrieve features for re-ranking. {}"
                        print(err_msg.format(self.pp, inst))

                tmp_img_sim = []
                tmp_sim_score = []
                for ires, res in enumerate(results):
                    dist = res.dist
                    if self.reranking:
                        # If reranking compute actual distance
                        try:
                            pos = res_samples_ids.index(res.id)
                            dist = np.linalg.norm(normed_feat -
                                                  res_features[pos])
                            # print "[{}: res_features[{}] approx. dist: {}, rerank dist: {}".format(res.id, pos, res.dist, dist)
                        except Exception as inst:
                            err_msg = "Could not compute reranked distance for sample {}, error {} {}"
                            print(err_msg.format(res.id, type(inst), inst))
                    if (filter_near_dup
                            and dist <= near_dup_th) or not filter_near_dup:
                        if not max_returned or (max_returned
                                                and ires < max_returned):
                            tmp_img_sim.append(str(res.id))
                            tmp_sim_score.append(dist)

                # If reranking, we need to reorder
                if self.reranking:
                    sids = np.argsort(tmp_sim_score, axis=0)
                    rerank_img_sim = []
                    rerank_sim_score = []
                    for si in sids:
                        rerank_img_sim.append(tmp_img_sim[si])
                        rerank_sim_score.append(tmp_sim_score[si])
                    tmp_img_sim = rerank_img_sim
                    tmp_sim_score = rerank_sim_score

                if tmp_img_sim:
                    rows = []
                    try:
                        rows = self.indexer.get_columns_from_sha1_rows(
                            tmp_img_sim, self.needed_output_columns)
                    except Exception as inst:
                        err_msg = "[{}: error] Could not retrieve similar images info from indexer. {}"
                        print(err_msg.format(self.pp, inst))
                    # rows should contain id, s3_url of images
                    # print rows
                    sim_images.append(rows)
                    sim_score.append(tmp_sim_score)
                else:
                    sim_images.append([])
                    sim_score.append([])

            all_sim_images.append(sim_images)
            all_sim_dets.append([])
            all_sim_score.append(sim_score)

        search_time = time.time() - start_search
        print("[{}: log] Search performed in {:0.3}s.".format(
            self.pp, search_time))

        # format output
        # print "all_sim_images",all_sim_images
        # print "all_sim_dets",all_sim_dets
        # print "all_sim_score",all_sim_score
        return self.do.format_output(dets, all_sim_images, all_sim_dets,
                                     all_sim_score, options_dict,
                                     self.input_type)
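A minimal query-side sketch of how a configured instance of the class above would be driven, assuming an already initialized object `searcher_hbase` and no detector; the feature dimensionality and option values are illustrative.

import numpy as np

# a single raw feature vector (hypothetical dimensionality); search_from_feats
# L2-normalizes each feature internally before querying the LOPQ searcher
feat = np.random.rand(256).astype(np.float32)

output = searcher_hbase.search_from_feats(dets=[], feats=[feat],
                                          options_dict={"near_dup": True,
                                                        "near_dup_th": 0.5,
                                                        "max_returned": 10})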
Example #6
class SearcherLOPQHBase(GenericSearcher):
  """SearcherLOPQHBase
  """

  def __init__(self, global_conf_in, prefix=default_prefix):
    """SearcherLOPQHBase constructor

    :param global_conf_in: configuration file or dictionary
    :type global_conf_in: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    """
    # number of processors to use for parallel computation of codes
    self.num_procs = 8  # could be read from configuration
    self.model_params = None
    self.get_pretrained_model = True
    self.nb_train_pca = 100000
    self.last_refresh = datetime.now()
    # NB: in load_codes full_refresh default is false...
    self.last_full_refresh = datetime.now()
    self.last_indexed_update = None
    self.pca_model_str = None
    self.skipfailed = False
    # making LOPQSearcherLMDB the default LOPQSearcher
    self.lopq_searcher = "LOPQSearcherLMDB"
    super(SearcherLOPQHBase, self).__init__(global_conf_in, prefix)
    self.set_pp(pp="SearcherLOPQHBase")
    #self.set_pp(pp="SearcherLOPQHBase." + str(os.getpid()))

  def get_model_params(self):
    """Reads model parameters from configuration

    Required parameters:

    - ``lopq_V``
    - ``lopq_M``
    - ``lopq_subq``
    - ``lopq_pcadims``
    - ``nb_train_pca``

    Optional parameters:

    - ``nb_min_train_pca``
    - ``lopq_searcher``
    - ``skipfailed``
    """
    V = self.get_required_param('lopq_V')
    M = self.get_required_param('lopq_M')
    subq = self.get_required_param('lopq_subq')
    # we could use that for a more fine grained model naming...
    self.model_params = {'V': V, 'M': M, 'subq': subq}
    if self.model_type == "lopq_pca":
      # Number of dimensions to keep after PCA
      self.model_params['pca'] = self.get_required_param('lopq_pcadims')
      self.nb_train_pca = self.get_required_param('nb_train_pca')
      nb_min_train_pca = self.get_param('nb_min_train_pca')
      if nb_min_train_pca:
        self.nb_min_train_pca = nb_min_train_pca
    self.lopq_searcher = self.get_param('lopq_searcher', default="LOPQSearcherLMDB")
    self.skipfailed = self.get_param('skipfailed', default=False)

  def build_pca_model_str(self):
    """Build PCA model string

    :return: PCA model string
    :rtype: str
    """
    # Use feature type, self.nb_train_pca and pca_dims
    if self.pca_model_str is None:
      # We could add some additional info: model parameters, number of samples used for training...
      self.pca_model_str = self.build_extr_str() + "_pca" + str(self.model_params['pca'])
      self.pca_model_str += "_train" + str(self.nb_train_pca)
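      # e.g. "<extr_str>_pca256_train100000" when lopq_pcadims=256 and nb_train_pca=100000 (illustrative values)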
    return self.pca_model_str

  def init_searcher(self):
    """ Initialize LOPQ model and searcher from configuration values
    """
    try:
      # Try to load pretrained model from storer
      # This can fail with error "exceptions.MemoryError" ?
      lopq_model = self.storer.load(self.build_model_str())
      if lopq_model is None:
        raise ValueError("Could not load model from storer.")
      # if self.verbose > 1:
      #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
      #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
    except Exception as inst:
      if type(inst) != ValueError:
        full_trace_error(inst)
      print("[{}: log] Looks like model was not trained yet ({})".format(self.pp, inst))

      self.loaded_pretrain_model = False
      # Try to get it from public bucket e.g.:
      # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
      if self.get_pretrained_model:
        log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
        print(log_msg.format(self.pp, self.build_model_str()))
        from ..common.dl import download_file
        import pickle
        try:
          # TODO: fallback bucket_name could be loaded dynamically from conf file...
          base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
          # This can fail with a "retrieval incomplete: got only" ...
          # Or can stall... why?
          download_file(base_model_path + self.build_model_str(), self.build_model_str())
          lopq_model = pickle.load(open(self.build_model_str(), 'rb'))
          # Avoid overwriting the model in s3 with the s3storer using the dig-cu-imagesearchindex bucket
          is_s3_storer = isinstance(self.storer, S3Storer)
          if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
            log_msg = "[{}: log] Skipping saving model {} back to s3"
            print(log_msg.format(self.pp, self.build_model_str()))
          else:
            log_msg = "[{}: log] Saving model {} to storer"
            print(log_msg.format(self.pp, self.build_model_str()))
            self.storer.save(self.build_model_str(), lopq_model)
          log_msg = "[{}: log] Loaded pretrained model {} from s3"
          print(log_msg.format(self.pp, self.build_model_str()))
          self.loaded_pretrain_model = True
        except Exception as inst:
          log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
          #print(log_msg.format(self.pp, self.build_model_str(), inst))
          full_trace_error(log_msg.format(self.pp, self.build_model_str(), inst))
          sys.stdout.flush()
      else:
        log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
        print(log_msg.format(self.pp, self.build_model_str()))

      if not self.loaded_pretrain_model:
        # This is from our modified LOPQ package...
        # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
        # 'LOPQModelPCA' could be the type of the model loaded from pickle file
        # from lopq.model import LOPQModel, LOPQModelPCA
        # Size of DB should depend on nb_train... How should we properly set the size of this?
        # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
        feat_size = get_feat_size(self.featurizer_type)
        if self.model_type == "lopq_pca":
          map_size = self.nb_train_pca * feat_size * 4 * 8
          map_size += self.nb_train * self.model_params['pca'] * 4 * 8
        else:
          map_size = self.nb_train * feat_size * 4 * 8
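        # NB: the "* 4 * 8" factor is presumably bytes per value times a safety margin; this sizing is only an estimate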
        # self.save_feat_env = lmdb.open('/data/lmdb_feats_' + self.build_model_str(),
        #                                map_size=int(1.1 * map_size),
        #                                writemap=True, map_async=True, max_dbs=2)
        self.save_feat_env = lmdb.open('/data/lmdb_feats_' + self.build_model_str(),
                                       map_size=int(1.1 * map_size), max_dbs=2)

        # Train and save model in save_path folder
        lopq_model = self.train_index()
        # TODO: we could build a more unique model identifier
        # (using domain information? sha1/md5 of model parameters? using date of training?)
        # that would also mean we should list from the storer and guess
        # (based on date of creation) the correct model above...
        self.storer.save(self.build_model_str(), lopq_model)

    # Setup searcher with LOPQ model
    if lopq_model:
      # LOPQSearcherLMDB is now the default, as it makes the index more persistent
      # and potentially more easily usable with multiple processes.
      if self.lopq_searcher == "LOPQSearcherLMDB":
        from lopq.search import LOPQSearcherLMDB
        # TODO: should we get path from a parameter? and/or add model_str to it?
        # path are inside the docker container only...
        self.searcher = LOPQSearcherLMDB(lopq_model,
                                         lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                                         id_lambda=str)
        # Should we move all the updates-related LMDB handling into a per-thread call?
        # How could we properly set the size of this?
        up_map_size = 1024 * 1000000 * 1
        # Again (see lopq.search LOPQSearcherLMDB), should we use writemap=True or not
        # self.updates_env = lmdb.open('/data/lmdb_updates_' + self.build_model_str(),
        #                              map_size=up_map_size, writemap=True, map_async=True, max_dbs=1)
        self.updates_env = lmdb.open('/data/lmdb_updates_' + self.build_model_str(),
                                     map_size=up_map_size, max_dbs=1)
        self.updates_index_db = self.updates_env.open_db("updates")
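        # this db maps update_id -> date string; see add_update() and get_update_date_db()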
      elif self.lopq_searcher == "LOPQSearcher":
        from lopq.search import LOPQSearcher
        self.searcher = LOPQSearcher(lopq_model)
      else:
        raise ValueError("Unknown 'lopq_searcher' type: {}".format(self.lopq_searcher))
    # NB: an empty lopq_model would make sense only if we just want to detect...

  def get_feats_from_lmbd(self, feats_db, nb_features, dtype):
    """Get features from LMBD database

    :param feats_db: features database
    :type feats_db: str
    :param nb_features: number of features
    :type nb_features: int
    :param dtype: numpy type
    :type dtype: :class:`numpy.dtype`
    :return: features
    :rtype: :class:`numpy.ndarray`
    """
    nb_saved_feats = self.get_nb_saved_feats(feats_db)
    nb_feats_to_read = min(nb_saved_feats, nb_features)
    feats = None
    if nb_feats_to_read > 0:
      with self.save_feat_env.begin(db=feats_db, write=False) as txn:
        with txn.cursor() as cursor:
          if cursor.first():
            first_item = cursor.item()
            first_feat = np.frombuffer(first_item[1], dtype=dtype)
            feats = np.zeros((nb_feats_to_read, first_feat.shape[0]))
            print("[get_feats_from_lmbd] Filling up features matrix: {}".format(feats.shape))
            sys.stdout.flush()
            for i, item in enumerate(cursor.iternext()):
              if i >= nb_feats_to_read:
                break
              feats[i, :] = np.frombuffer(item[1], dtype=dtype)
    return feats

  def save_feats_to_lmbd(self, feats_db, samples_ids, np_features, max_feats=0):
    """Save features to LMDB database

    :param feats_db: features database name
    :type feats_db: str
    :param samples_ids: samples ids
    :type samples_ids: list
    :param np_features: features
    :type np_features: :class:`numpy.ndarray`
    :param max_feats: maximum number of features to store in database
    :type max_feats: int
    :return: total number of features in database
    :rtype: int
    """
    with self.save_feat_env.begin(db=feats_db, write=True) as txn:
      nb_feats = txn.stat()['entries']
      for i, sid in enumerate(samples_ids):
        txn.put(bytes(sid), np_features[i, :].tobytes())
        nb_feats = txn.stat()['entries']
        if max_feats and nb_feats >= max_feats:
          return nb_feats
    return nb_feats

  def get_nb_saved_feats(self, feats_db):
    """Get number of features in LMBD database ``feats_db``

    :param feats_db: features database name
    :type feats_db: str
    :return: number of features
    :rtype: int
    """
    with self.save_feat_env.begin(db=feats_db, write=False) as txn:
      return txn.stat()['entries']

  def get_train_features(self, nb_features, lopq_pca_model=None, nb_min_train=None):
    """Get training features

    :param nb_features: number of features
    :type nb_features: int
    :param lopq_pca_model: PCA-enabled LOPQ model used to project the features before storing them, if any
    :type lopq_pca_model: :class:`lopq.model.LOPQModelPCA`
    :param nb_min_train: minimum number of training features to gather
    :type nb_min_train: int
    :return: features
    :rtype: :class:`numpy.ndarray`
    """
    if nb_min_train is None:
      nb_min_train = nb_features
    if lopq_pca_model:
      feats_db = self.save_feat_env.open_db("feats_pca")
      dtype = np.float32
    else:
      feats_db = self.save_feat_env.open_db("feats")
      from ..featurizer.featsio import get_feat_dtype
      dtype = get_feat_dtype(self.featurizer_type)
    nb_saved_feats = self.get_nb_saved_feats(feats_db)
    nb_features_to_read = nb_features

    seen_updates = set()

    if nb_saved_feats < nb_features:
      print("[{}: log] Gathering {} training samples...".format(self.pp, nb_features))
      sys.stdout.flush()
      start_date = "1970-01-01"
      done = False
      # Accumulate until we have enough features, or we have read all features if 'wait_for_nbtrain'
      # is false and we have at least nb_min_train
      while not done:
        for batch_updates in self.indexer.get_updates_from_date(start_date=start_date,
                                                                extr_type=self.build_extr_str()):
          # for updates in batch_updates:
          #  for update in updates:

          for update in batch_updates:
            try:
              # We could check if update has been processed, but if not we won't get features anyway
              update_id = update[0]
              #if column_list_sha1s in update[1]:
              if self.indexer.get_col_listsha1s() in update[1]:
                if update_id not in seen_updates:
                  sha1s = update[1][self.indexer.get_col_listsha1s()]
                  sids, features = self.indexer.get_features_from_sha1s(sha1s.split(','),
                                                                        self.build_extr_str())
                  if features:
                    # Apply PCA to features here to save memory
                    if lopq_pca_model:
                      np_features = lopq_pca_model.apply_PCA(np.asarray(features))
                    else:
                      np_features = np.asarray(features)
                    log_msg = "[{}: log] Got features {} from update {}"
                    print(log_msg.format(self.pp, np_features.shape, update_id))
                    sys.stdout.flush()
                    # just appending like this does not account for duplicates...
                    # train_features.extend(np_features)
                    nb_saved_feats = self.save_feats_to_lmbd(feats_db, sids, np_features)
                    seen_updates.add(update_id)
                  else:
                    if self.verbose > 3:
                      log_msg = "[{}: log] Did not get features from update {}"
                      print(log_msg.format(self.pp, update_id))
                      sys.stdout.flush()
                  if nb_saved_feats >= nb_features:
                    done = True
                    break
              else:
                warn_msg = "[{}: warning] Update {} has no list of images associated to it."
                print(warn_msg.format(self.pp, update_id))
                sys.stdout.flush()
            except Exception as inst:
              from cufacesearch.common.error import full_trace_error
              err_msg = "[{}: error] Failed to get features: {} {}"
              full_trace_error(err_msg.format(self.pp, type(inst), inst))
              sys.stdout.flush()
            else:
              if self.verbose > 4:
                print("[{}: log] Got {} training samples so far...".format(self.pp, nb_saved_feats))
                sys.stdout.flush()
            if done:
              nb_features_to_read = nb_saved_feats
              break
        else:
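          # NB: this 'else' is attached to the 'for' loop above; it runs only when the loop finished without a 'break'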
          if not done:
            # Wait for new updates...
            # TODO: could be optional
            if self.wait_for_nbtrain:
              if nb_saved_feats >= nb_min_train:
                log_msg = "[{}: log] Gathered minimum number of training features ({})..."
                print(log_msg.format(self.pp, nb_min_train))
                sys.stdout.flush()
                break
              else:
                log_msg = "[{}: log] Waiting for new updates. Got {} training samples so far..."
                print(log_msg.format(self.pp, nb_saved_feats))
                sys.stdout.flush()
                time.sleep(60)
            else:
              log_msg = "[{}: log] Gathered all available features ({})..."
              print(log_msg.format(self.pp, self.get_nb_saved_feats(feats_db)))
              sys.stdout.flush()
              break

    return self.get_feats_from_lmbd(feats_db, nb_features_to_read, dtype)

  def train_index(self):
    """Train search index

    :return: search index
    :rtype: LOPQModel, LOPQModelPCA
    """

    if self.model_type == "lopq":
      train_np = self.get_train_features(self.nb_train, nb_min_train=self.nb_min_train)
      print("Got train features array with shape: {}".format(train_np.shape))
      nb_train_feats = train_np.shape[0]
      sys.stdout.flush()

      if nb_train_feats >= self.nb_train:
        from lopq.model import LOPQModel
        # we could have default values for those parameters and/or heuristic to estimate them based on data count...
        lopq_model = LOPQModel(V=self.model_params['V'], M=self.model_params['M'],
                               subquantizer_clusters=self.model_params['subq'])
        # we could have separate training/indexing features
        msg = "[{}.train_model: info] Starting local training of 'lopq' model with parameters {} using {} features."
        print(msg.format(self.pp, self.model_params, nb_train_feats))
        start_train = time.time()
        # specify a n_init < 10 (default value) to speed-up training?
        lopq_model.fit(train_np, verbose=True)
        # save model
        self.storer.save(self.build_model_str(), lopq_model)
        msg = "[{}.train_model: info] Trained lopq model in {}s."
        print(msg.format(self.pp, time.time() - start_train))
        return lopq_model
      else:
        msg = "[{}.train_model: error] Could not train model, not enough training samples."
        print(msg.format(self.pp))

    elif self.model_type == "lopq_pca":
      # lopq_pca training.
      from lopq.model import LOPQModelPCA
      # we could have default values for those parameters
      # and/or heuristic to estimate them based on data count...
      lopq_model = LOPQModelPCA(V=self.model_params['V'], M=self.model_params['M'],
                                subquantizer_clusters=self.model_params['subq'], renorm=True)
      # pca loading/training first
      pca_model = self.storer.load(self.build_pca_model_str())
      if pca_model is None:
        train_np = self.get_train_features(self.nb_train_pca, nb_min_train=self.nb_min_train_pca)
        msg = "[{}.train_model: info] Training PCA model, keeping {} dimensions from features {}."
        print(msg.format(self.pp, self.model_params['pca'], train_np.shape))
        sys.stdout.flush()
        start_train_pca = time.time()
        lopq_model.fit_pca(train_np, pca_dims=self.model_params['pca'])
        info_msg = "[{}.train_model: info] Trained pca model in {}s."
        print(info_msg.format(self.pp, time.time() - start_train_pca))
        del train_np
        self.storer.save(self.build_pca_model_str(),
                         {"P": lopq_model.pca_P, "mu": lopq_model.pca_mu})
      else:
        lopq_model.pca_P = pca_model["P"]
        lopq_model.pca_mu = pca_model["mu"]
      # train model
      train_np = self.get_train_features(self.nb_train, lopq_pca_model=lopq_model,
                                         nb_min_train=self.nb_min_train)
      msg = "[{}.train_model: info] Training 'lopq_pca' model with parameters {} using features {}"
      print(msg.format(self.pp, self.model_params, train_np.shape))
      sys.stdout.flush()
      start_train = time.time()
      # specify a n_init < 10 (default value) to speed-up training?
      lopq_model.fit(train_np, verbose=True, apply_pca=False, train_pca=False)
      # TODO: we could evaluate model based on reconstruction of some randomly sampled features?
      # save model
      self.storer.save(self.build_model_str(), lopq_model)
      info_msg = "[{}.train_model: info] Trained lopq model in {}s."
      print(info_msg.format(self.pp, time.time() - start_train))
      sys.stdout.flush()
      return lopq_model
      # err_msg = "[{}.train_model: error] Local training of 'lopq_pca' model not yet implemented."
      # raise NotImplementedError(err_msg.format(self.pp))
    else:
      err_msg = "[{}.train_model: error] Unknown 'lopq' type {}."
      raise ValueError(err_msg.format(self.pp, self.model_type))
    # print train_features_path, os.path.exists(train_features_path), lopq_params

  # TODO: should we try to evaluate index by pushing train_features to a temporary searcher
  #    - compute exhaustive search for some randomly selected samples
  #    - analyze retrieval performance of approximate search?
  # technically we could even explore different configurations...

  def compute_codes(self, det_ids, data, codes_path=None):
    """Compute codes for features in ``data`` corresponding to samples ``det_ids``

    :param det_ids: samples ids
    :type det_ids: list
    :param data: features
    :type data: list(:class:`numpy.ndarray`)
    :param codes_path: path to use to save codes using storer
    :type codes_path: str
    :return: codes dictionary
    :rtype: dict
    """
    # Compute codes for each update batch and save them
    #from lopq.utils import compute_codes_parallel
    from lopq.utils import compute_codes_notparallel
    msg = "[{}.compute_codes: log] Computing codes for {} ({} unique) {}s from {} features"
    print(msg.format(self.pp, len(det_ids), len(set(det_ids)), self.input_type, len(data)))

    # That keeps the ordering intact, but output is a chain
    # Is this blocking now with gunicorn?
    #codes = compute_codes_parallel(data, self.searcher.model, self.num_procs)
    codes = compute_codes_notparallel(data, self.searcher.model)

    # Build dict output
    codes_dict = dict()
    count_codes = 0
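    # each codes_dict entry will map a sample id to its LOPQ code as [coarse cell ids, fine subquantizer codes]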
    for i, code in enumerate(codes):
      count_codes += 1
      codes_dict[det_ids[i]] = [code.coarse, code.fine]

    if self.verbose > 3:
      msg = "[{}.compute_codes: log] Computed {} codes"
      print(msg.format(self.pp, len(codes_dict)))

    # Save
    if codes_path:
      # # Some old updates have duplicate sha1s...
      # if self.verbose > 1 and len(codes_dict) < len(det_ids):
      #   msg = "[{}.compute_codes: log] Saving only {} of {} codes. det_ids: {}"
      #   print(msg.format(self.pp, len(codes_dict), count_codes, det_ids))
      self.storer.save(codes_path, codes_dict)

    return codes_dict

  def add_update(self, update_id, date_db=None):
    """Add update id ``update_id`` to the database or list of update ids

    :param update_id: update id
    :type update_id: str
    :param date_db: datetime to save for that update
    :type date_db: datetime
    """
    if date_db is None:
      date_db = datetime.now()
    else:
      if self.verbose > 4:
        msg = "[{}.add_update: log] Saving update {} with date {}"
        print(msg.format(self.pp, update_id, date_db))
    if self.lopq_searcher == "LOPQSearcherLMDB":
      # Use another LMDB to store updates indexed
      with self.updates_env.begin(db=self.updates_index_db, write=True) as txn:
        txn.put(bytes(update_id), bytes(date_db))
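        # the datetime is stored as a string and parsed back with DATE_DB_FORMAT in get_update_date_db()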
    else:
      self.indexed_updates.add(update_id)
    if self.last_indexed_update is None or update_id > self.last_indexed_update:
      self.last_indexed_update = update_id

  def get_update_date_db(self, update_id):
    """Get update id ``update_id`` saved date in database

    :param update_id: update id
    :type update_id: str
    :raises TypeError: if self.lopq_searcher is not "LOPQSearcherLMDB"
    :raises ValueError: if ``update_id`` is not in database
    """
    if self.lopq_searcher == "LOPQSearcherLMDB":
      with self.updates_env.begin(db=self.updates_index_db, write=False) as txn:
        found_update = txn.get(bytes(update_id))
        if found_update:
          # parse found_update as string
          if self.verbose > 4:
            msg = "[{}.get_update_date_db: log] update {} date_db in database is: {}"
            print(msg.format(self.pp, update_id, found_update))
          return datetime.strptime(str(bytes(found_update)), DATE_DB_FORMAT)
        else:
          msg = "[{}.get_update_date_db: error] update {} is not in database"
          raise ValueError(msg.format(self.pp, update_id))
    else:
      msg = "[{}.get_update_date_db: error] lopq_searcher is not of type \"LOPQSearcherLMDB\""
      raise TypeError(msg.format(self.pp))

  def skip_update(self, update_id, dtn):
    """Check if we should skip loading update ``update_id`` because it has been marked as fully
    processed and indexed already (using a date in the future)

    :param update_id: update id
    :type update_id: str
    :param dtn: datetime of now
    :type dtn: :class:`datetime.datetime`
    :return: whether this update should be skipped
    :rtype: bool
    """
    try:
      read_date_db = self.get_update_date_db(update_id)
      if self.verbose > 5:
        msg = "[{}: log] Check whether to skip update {} (date_db: {}, dtn: {})"
        print(msg.format(self.pp, update_id, read_date_db, dtn))
      if read_date_db.year > dtn.year:
        if self.verbose > 4:
          msg = "[{}: log] Skipping update {} marked with a future date: {}."
          print(msg.format(self.pp, update_id, read_date_db))
        return True
      return False
    except Exception as inst:
      if self.verbose > 1:
        print(inst)
      return False

  def is_update_indexed(self, update_id):
    """Check whether update ``update_id`` has already been indexed

    :param update_id: update id
    :type update_id: str
    :return: True (if indexed), False (if not)
    :rtype: bool
    """
    if self.lopq_searcher == "LOPQSearcherLMDB":
      #  mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot?
      with self.updates_env.begin(db=self.updates_index_db, write=False) as txn:
        found_update = txn.get(bytes(update_id))
        if found_update:
          return True
        else:
          return False
    else:
      return update_id in self.indexed_updates

  def is_update_processed(self, update_cols):
    """Check whether update columns ``update_cols`` contain the flag indicating that the update
    has been processed

    :param update_cols: update columns dictionary
    :type update_cols: dict
    :return: True (if processed), False (if not)
    :rtype: bool
    """
    if self.indexer.get_col_upproc() in update_cols:
      return True
    return False

  def get_latest_update_suffix(self):
    """Get latest update suffix

    :return: latest update suffix
    :rtype: str
    """
    if self.last_indexed_update is None:
      if self.lopq_searcher == "LOPQSearcherLMDB":
        # Try to get it from the DB
        with self.updates_env.begin(db=self.updates_index_db, write=False) as txn:
          with txn.cursor() as cursor:
            if cursor.last():
              item = cursor.item()
              self.last_indexed_update = item[0]
              suffix = '_'.join(self.last_indexed_update.split('_')[6:])
            else:  # Would happen on empty db?
              suffix = "1970-01-01"
      else:
        suffix = "1970-01-01"
    else:
      suffix = '_'.join(self.last_indexed_update.split('_')[6:])
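      # assumes update ids end with a date-like suffix after the 6th '_'-separated token, used as 'start_date' in load_codes()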
    return suffix

  def load_codes(self, full_refresh=False):
    """Load codes

    :param full_refresh: whether to perform a full refresh or not
    :type full_refresh: bool
    """
    # For multi-workers setting with gunicorn
    self.set_pp(pp="SearcherLOPQHBase." + str(os.getpid()))

    # Calling this method can also perform an update of the index
    if not self.searcher:
      info_msg = "[{}.load_codes: info] Not loading codes as searcher is not initialized."
      print(info_msg.format(self.pp))
      return

    start_load = time.time()
    total_compute_time = 0

    try:
      # try to get date of last update
      start_date = "1970-01-01"
      if not full_refresh:
        start_date = self.get_latest_update_suffix()
      extr_str = self.build_extr_str()
      feat_size = get_feat_size(self.featurizer_type)
      #feat_type = get_feat_dtype(self.featurizer_type)
      feat_type = self.featurizer_type

      # Get all updates ids for the extraction type
      # TODO: this scan makes the API unresponsive for ~2 minutes during the update process...
      for batch_updates in self.indexer.get_updates_from_date(start_date=start_date,
                                                              extr_type=extr_str):
        for update in batch_updates:
          update_id = update[0]
          # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot?
          if self.is_update_indexed(update_id) and not full_refresh:
            if self.verbose > 4:
              print("[{}: log] Skipping update {} already indexed.".format(self.pp, update_id))
              continue
          else:
            dtn = datetime.now()
            if self.is_update_processed(update[1]) and not self.skip_update(update_id, dtn):
              print("[{}: log] Looking for codes of update {}".format(self.pp, update_id))
              # Get this update codes
              codes_string = self.build_codes_string(update_id)
              try:
                # Check for precomputed codes
                codes_dict = self.storer.load(codes_string, silent=True)
                if codes_dict is None:
                  msg = "[{}: log] Could not load codes from {}"
                  raise ValueError(msg.format(self.pp, codes_string))
                # If full_refresh, check that we have as many codes as available features
                if full_refresh:
                  # Also check for 'completed' flag?
                  if self.indexer.get_col_listsha1s() in update[1]:
                    set_sha1s = set(update[1][self.indexer.get_col_listsha1s()].split(','))
                    sids, _ = self.indexer.get_features_from_sha1s(list(set_sha1s), extr_str)
                    if len(set(sids)) > len(codes_dict):
                      msg = "[{}: log] Update {} has {} new features"
                      diff_count = len(set(sids)) - len(codes_dict)
                      raise ValueError(msg.format(self.pp, update_id, diff_count))
                    else:
                      msg = "[{}: log] Skipping update {} indexed with all {}/{} features"
                      print(msg.format(self.pp, update_id, len(codes_dict), len(set(sids))))
                      miss_extr = self.indexer.get_missing_extr_sha1s(list(set_sha1s), extr_str,
                                                                      skip_failed=self.skipfailed)
                      # If all sha1s have been processed, no need to ever check that update again
                      # Store that information as future date_db to avoid ever checking again...
                      if not miss_extr and self.lopq_searcher == "LOPQSearcherLMDB":
                        dtn = dtn.replace(year=9999)
              except Exception as inst:
                # Update codes not available
                if self.verbose > 3:
                  print(inst)
                # Compute codes for update not yet processed and save them
                start_compute = time.time()
                # Get detections (if any) and features
                if self.indexer.get_col_listsha1s() in update[1]:
                  list_sha1s = list(set(update[1][self.indexer.get_col_listsha1s()].split(',')))
                  sids, fts = self.indexer.get_features_from_sha1s(list_sha1s, extr_str, feat_type)
                  if fts:
                    if fts[0].shape[-1] != feat_size:
                      msg = "[{}.load_codes: error] Invalid feature size {} vs {} expected"
                      raise ValueError(msg.format(self.pp, fts[0].shape[-1], feat_size))
                    codes_dict = self.compute_codes(sids, fts, codes_string)
                    update_compute_time = time.time() - start_compute
                    total_compute_time += update_compute_time
                    if self.verbose > 0:
                      log_msg = "[{}: log] Update {} codes computation done in {}s"
                      print(log_msg.format(self.pp, update_id, update_compute_time))
                  else:
                    print("[{}: warning] Update {} has no features.".format(self.pp, update_id))
                    continue
                else:
                  print("[{}: warning] Update {} has no list of images.".format(self.pp, update_id))
                  continue

              # Use new method add_codes_from_dict of searcher
              self.searcher.add_codes_from_dict(codes_dict)
              self.add_update(update_id, date_db=dtn)

      total_load = time.time() - start_load
      self.last_refresh = datetime.now()

      print("[{}: log] Total udpates computation time is: {}s".format(self.pp, total_compute_time))
      print("[{}: log] Total udpates loading time is: {}s".format(self.pp, total_load))
      # Total udpates loading time is: 0.0346581935883s, really? Seems much longer

    except Exception as inst:
      full_trace_error("[{}: error] Could not load codes. {}".format(self.pp, inst))
      # print("[{}: error] Could not load codes. {}".format(self.pp, inst))

  # def load_all_codes(self):
  #   # load self.indexed_updates, self.searcher.index and self.searcher.nb_indexed
  #   # NOT for LOPQSearcherLMDB
  #   pass
  #
  # def save_all_codes(self):
  #   # we should save self.indexed_updates, self.searcher.index and self.searcher.nb_indexed
  #   # self.searcher.index could be big, how to save without memory issue...
  #   # NOT for LOPQSearcherLMDB
  #   pass

  def search_from_feats(self, dets, feats, options_dict=dict()):
    """Search the index using features ``feats`` of samples ``dets``

    :param dets: list of query samples, images or list of detections in each image
    :type dets: list
    :param feats: list of features
    :type feats: list
    :param options_dict: options dictionary
    :type options_dict: dict
    :return: formatted output
    :rtype: collections.OrderedDict
    """
    # NB: dets is a list of list
    import time
    # For multi-workers setting with gunicorn
    self.set_pp(pp="SearcherLOPQHBase." + str(os.getpid()))

    start_search = time.time()
    extr_str = self.build_extr_str()
    #feat_size = get_feat_size(self.featurizer_type)
    #feat_type = get_feat_dtype(self.featurizer_type)
    feat_type = self.featurizer_type

    all_sim_images = []
    all_sim_dets = []
    all_sim_score = []

    # check what is the near duplicate config
    filter_near_dup = False
    if (self.near_dup and "near_dup" not in options_dict) or (
            "near_dup" in options_dict and options_dict["near_dup"]):
      filter_near_dup = True
      if "near_dup_th" in options_dict:
        near_dup_th = options_dict["near_dup_th"]
      else:
        near_dup_th = self.near_dup_th

    max_returned = self.sim_limit
    if "max_returned" in options_dict:
      max_returned = options_dict["max_returned"]
    # this should be set with a parameter either in conf or options_dict too.
    # should we use self.quota here? and potentially overwrite from options_dict
    quota = min(1000 * max_returned, 10000)

    # print dets
    if self.detector is not None:
      # query for each feature
      for i in range(len(dets)):

        sim_images = []
        sim_dets = []
        sim_score = []

        for j in range(len(dets[i][1])):
          results = []
          if "detect_only" not in options_dict or not options_dict["detect_only"]:
            if self.searcher:
              # Normalize the feature first, as is done during extraction...
              norm_feat = np.linalg.norm(feats[i][j])
              normed_feat = np.squeeze(feats[i][j] / norm_feat)
              results, visited = self.searcher.search(normed_feat, quota=quota, limit=max_returned,
                                                      with_dists=True)
              msg = "[{}.search_from_feats: log] Got {} results by visiting {} cells, first is: {}"
              print(msg.format(self.pp, len(results), visited, results[0] if results else None))

          # If reranking, get features from hbase for detections using res.id
          #   we could also already get 's3_url' to avoid a second call to HBase later...
          if self.reranking:
            try:
              res_list_sha1s = [str(x.id).split('_')[0] for x in results]
              res_sids, res_fts = self.indexer.get_features_from_sha1s(res_list_sha1s, extr_str,
                                                                       feat_type)
              # # FIXME: dirty fix for dlib features size issue.
              # # To be removed once workflow applied on all legacy data
              # if res_fts is not None and res_fts[0].shape[-1] < 128:
              #   res_sids, res_fts = self.indexer.get_features_from_sha1s(res_list_sha1s,
              #                                                                        self.build_extr_str(),
              #                                                                        "float32")
              #   if res_fts:
              #     forced_msg = "Forced decoding of features as float32. Got {} samples, features with shape {}"
              #     print(forced_msg.format(len(res_sids), res_fts[0].shape))
            except Exception as inst:
              err_msg = "[{}: error] Could not retrieve features for re-ranking. {}"
              print(err_msg.format(self.pp, inst))

          tmp_img_sim = []
          tmp_dets_sim_ids = []
          tmp_dets_sim_score = []
          for ires, res in enumerate(results):
            dist = res.dist
            # if reranking compute actual distance
            if self.reranking:
              try:
                pos = res_sids.index(res.id)
                dist = np.linalg.norm(normed_feat - res_fts[pos])
                # print "[{}: res_fts[{}] approx. dist: {}, rerank dist: {}".format(res.id, pos, res.dist, dist)
              except Exception as inst:
                # Means feature was not saved to backend index...
                err_msg = "Could not compute reranking distance for sample {}, error {} {}"
                print(err_msg.format(res.id, type(inst), inst))
            if (filter_near_dup and dist <= near_dup_th) or not filter_near_dup:
              if not max_returned or (max_returned and ires < max_returned):
                tmp_dets_sim_ids.append(res.id)
                # here id would be face_id that we could build as sha1_facebbox?
                tmp_img_sim.append(str(res.id).split('_')[0])
                tmp_dets_sim_score.append(dist)

          # If reranking, we need to reorder
          if self.reranking:
            sids = np.argsort(tmp_dets_sim_score, axis=0)
            rerank_img_sim = []
            rerank_dets_sim_ids = []
            rerank_dets_sim_score = []
            for si in sids:
              rerank_img_sim.append(tmp_img_sim[si])
              rerank_dets_sim_ids.append(tmp_dets_sim_ids[si])
              rerank_dets_sim_score.append(tmp_dets_sim_score[si])
            tmp_img_sim = rerank_img_sim
            tmp_dets_sim_ids = rerank_dets_sim_ids
            tmp_dets_sim_score = rerank_dets_sim_score

          # print tmp_img_sim
          if tmp_img_sim:
            rows = []
            try:
              rows = self.indexer.get_columns_from_sha1_rows(tmp_img_sim, self.needed_output_columns)
            except Exception as inst:
              err_msg = "[{}: error] Could not retrieve similar images info from indexer. {}"
              print(err_msg.format(self.pp, inst))
            # rows should contain id, s3_url of images
            # print rows
            if not rows:
              sim_images.append([(x,) for x in tmp_img_sim])
            elif len(rows) < len(tmp_img_sim):
              # fall back to just sha1s... but beware to keep order...
              dec = 0
              fixed_rows = []
              for pos, sha1 in enumerate(tmp_img_sim):
                if rows[pos - dec][0] == sha1:
                  fixed_rows.append(rows[pos - dec])
                else:
                  dec += 1
                  fixed_rows.append((sha1,))
              sim_images.append(fixed_rows)
            else:
              sim_images.append(rows)
            sim_dets.append(tmp_dets_sim_ids)
            sim_score.append(tmp_dets_sim_score)
          else:
            sim_images.append([])
            sim_dets.append([])
            sim_score.append([])

        all_sim_images.append(sim_images)
        all_sim_dets.append(sim_dets)
        all_sim_score.append(sim_score)
    else:
      # No detection
      results = []
      sim_images = []
      sim_score = []

      for i in range(len(feats)):
        if self.searcher:
          # Normalize the feature first, as is done during extraction...
          norm_feat = np.linalg.norm(feats[i])
          normed_feat = np.squeeze(feats[i] / norm_feat)
          results, visited = self.searcher.search(normed_feat, quota=quota, limit=max_returned,
                                                  with_dists=True)
          res_msg = "[{}.search_from_feats: log] Got {} results by visiting {} cells, first is: {}"
          print(res_msg.format(self.pp, len(results), visited, results[0] if results else None))

        # Reranking, get features from hbase for detections using res.id
        if self.reranking:
          try:
            res_list_sha1s = [str(x.id) for x in results]
            res_sids, res_fts = self.indexer.get_features_from_sha1s(res_list_sha1s, extr_str,
                                                                     feat_type)
          except Exception as inst:
            err_msg = "[{}: error] Could not retrieve features for re-ranking. {}"
            print(err_msg.format(self.pp, inst))

        tmp_img_sim = []
        tmp_sim_score = []
        for ires, res in enumerate(results):
          dist = res.dist
          if self.reranking:
            # If reranking compute actual distance
            try:
              pos = res_sids.index(res.id)
              dist = np.linalg.norm(normed_feat - res_fts[pos])
              # print "[{}: res_fts[{}] approx. dist: {}, rerank dist: {}".format(res.id, pos, res.dist, dist)
            except Exception as inst:
              err_msg = "Could not compute reranked distance for sample {}, error {} {}"
              print(err_msg.format(res.id, type(inst), inst))
          if (filter_near_dup and dist <= near_dup_th) or not filter_near_dup:
            if not max_returned or (max_returned and ires < max_returned):
              tmp_img_sim.append(str(res.id))
              tmp_sim_score.append(dist)

        # If reranking, we need to reorder
        if self.reranking:
          sids = np.argsort(tmp_sim_score, axis=0)
          rerank_img_sim = []
          rerank_sim_score = []
          for si in sids:
            rerank_img_sim.append(tmp_img_sim[si])
            rerank_sim_score.append(tmp_sim_score[si])
          tmp_img_sim = rerank_img_sim
          tmp_sim_score = rerank_sim_score

        if tmp_img_sim:
          rows = []
          try:
            rows = self.indexer.get_columns_from_sha1_rows(tmp_img_sim, self.needed_output_columns)
          except Exception as inst:
            err_msg = "[{}: error] Could not retrieve similar images info from indexer. {}"
            print(err_msg.format(self.pp, inst))
          # rows should contain id, s3_url of images
          # print rows
          sim_images.append(rows)
          sim_score.append(tmp_sim_score)
        else:
          sim_images.append([])
          sim_score.append([])

      all_sim_images.append(sim_images)
      all_sim_dets.append([])
      all_sim_score.append(sim_score)

    search_time = time.time() - start_search
    print("[{}: log] Search performed in {:0.3}s.".format(self.pp, search_time))

    # format output
    # print "all_sim_images",all_sim_images
    # print "all_sim_dets",all_sim_dets
    # print "all_sim_score",all_sim_score
    return self.do.format_output(dets, all_sim_images, all_sim_dets, all_sim_score, options_dict,
                                 self.input_type)
Example #7
def test_oxford5k():

    random_state = 40
    data = load_oxford_data()
    train, test = train_test_split(data,
                                   test_size=0.2,
                                   random_state=random_state)

    # Compute distance-sorted neighbors in training set for each point in test set
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=random_state)

    # Assert correct code computation
    assert_equal(m.predict(test[0]),
                 ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250)))

    # Assert low number of empty cells
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # Assert true NN recall on test set
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with just coarse quantizers
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with coarse quantizers and rotations
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))
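

# A hedged sketch (not part of the original tests): when queried with
# with_dists=True, the searcher returns (results, visited) where each result
# is a namedtuple with 'id', 'code' and 'dist' fields, as relied upon by the
# searcher classes below. The 'quota' and 'limit' values here are illustrative.
def sketch_search_with_dists(model, index_data, query):
    searcher = LOPQSearcher(model)
    searcher.add_data(index_data)
    results, visited = searcher.search(query, quota=100, limit=10, with_dists=True)
    # keep only (id, distance) pairs for the caller
    return [(r.id, r.dist) for r in results], visited
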
class SearcherLOPQHBase():
    def __init__(self, global_conf_filename):
        self.global_conf_filename = global_conf_filename
        self.global_conf = json.load(open(global_conf_filename, 'rt'))
        self.read_conf()
        self.init_lopq()
        self.init_hbaseindexer()
        self.init_feature_extractor()
        self.load_codes()
        self.url_field = 'info:s3_url'
        self.needed_output_columns = [self.url_field]

    def read_conf(self):
        # these parameters may be overwritten by web call
        self.sim_limit = self.global_conf['SE_sim_limit']
        self.quota = self.sim_limit * 10
        self.near_dup = self.global_conf['SE_near_dup']
        self.near_dup_th = self.global_conf['SE_near_dup_th']
        self.ratio = self.global_conf['SE_ratio']
        self.topfeature = 0
        if "SE_topfeature" in self.global_conf:
            self.topfeature = int(self.global_conf['SE_topfeature'])
        self.out_dir = ""
        if "SE_outdir" in self.global_conf:
            self.out_dir = self.global_conf['SE_outdir']
            from ..memex_tools.image_dl import mkpath
            mkpath(self.out_dir)

    def init_lopq(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
        """
        field = 'SE_lopq'
        if field not in self.global_conf:
            raise ValueError("[Searcher: error] " + field +
                             " is not defined in configuration file.")
        elif self.global_conf[field] == "lopq_pca":
            from lopq.model import LOPQModelPCA
            from lopq.search import LOPQSearcher
            import pickle
            # actually load pickle from disk
            lopq_model_path = self.global_conf['SE_lopqmodel']
            if lopq_model_path.startswith(START_HDFS):
                # deal with HDFS path
                from lopq.utils import copy_from_hdfs
                import shutil
                filename = copy_from_hdfs(lopq_model_path)
                lopq_model = pickle.load(open(filename, "rb"))
                try:
                    shutil.rmtree(os.path.dirname(filename))
                except Exception as inst:
                    pass
            else:
                # local path in config
                lopq_model = pickle.load(open(lopq_model_path, "rb"))
            self.searcher_lopq = LOPQSearcher(lopq_model)
        else:
            raise ValueError(
                "[SearcherLOPQHBase: error] unkown 'lopq' type {}.".format(
                    self.global_conf[field]))

    def init_hbaseindexer(self):
        """ Initialize HbBase Indexer from `global_conf` value.
        """
        field = 'SE_indexer'
        if field not in self.global_conf:
            raise ValueError("[SearcherLOPQHBase: error] " + field +
                             " is not defined in configuration file.")
        elif self.global_conf[field] == "hbase_indexer_minimal":
            from ..indexer.hbase_indexer_minimal import HBaseIndexerMinimal
            self.indexer = HBaseIndexerMinimal(self.global_conf_filename)
        else:
            raise ValueError(
                "[SearcherLOPQHBase: error] unkown 'indexer' {}.".format(
                    self.global_conf[field]))

    def init_feature_extractor(self):
        """ Initialize Feature Extractor from `global_conf` value.
        """
        field = 'SE_feature_extractor'
        if field not in self.global_conf:
            raise ValueError("[SearcherLOPQHBase: error] " + field +
                             " is not defined in configuration file.")
        elif self.global_conf[field] == "sentibank_tensorflow":
            from ..feature_extractor.sentibank_tensorflow import SentiBankTensorflow
            self.feature_extractor = SentiBankTensorflow(
                self.global_conf_filename)
        else:
            raise ValueError(
                "[SearcherLOPQHBase: error] unkown 'feature_extractor' {}.".
                format(self.global_conf[field]))

    def load_codes(self):
        codes_path = self.global_conf['SE_codes_path']
        if codes_path.startswith(START_HDFS):
            self.searcher_lopq.add_codes_from_hdfs(codes_path)
        else:
            self.searcher_lopq.add_codes_from_local(codes_path)

    def check_ratio(self):
        '''Check if we need to set the ratio based on topfeature.'''
        if self.topfeature > 0:
            self.ratio = self.topfeature * 1.0 / len(
                self.indexer.sha1_featid_mapping)
            print "[Searcher.check_ratio: log] Set ratio to {} as we want top {} images out of {} indexed.".format(
                self.ratio, self.topfeature,
                len(self.indexer.sha1_featid_mapping))

    def build_output(self,
                     nb_query,
                     corrupted,
                     list_sha1_id,
                     sim,
                     sim_score,
                     options_dict=dict()):

        dec = 0
        output = []
        do = DictOutput()

        for i in range(0, nb_query):
            output.append(dict())
            if i in corrupted:
                output[i][do.map['similar_images']] = OrderedDict([[do.map['number'],0],\
                                                           [do.map['sha1'],[]],\
                                                           [do.map['cached_image_urls'],[]],\
                                                           [do.map['distance'],[]]])
                dec += 1
                continue
            ii = i - dec
            output[i][do.map['similar_images']] = OrderedDict([[do.map['number'],len(sim[ii])],\
                                                               [do.map['sha1'],[]],\
                                                               [do.map['cached_image_urls'],[]],\
                                                               [do.map['distance'],[]]])
            output[i][do.map['query_sha1']] = list_sha1_id[ii]
            ok_sims = []
            for jj, simj in enumerate(sim[ii]):
                found_columns = [
                    c in simj[1] for c in self.needed_output_columns
                ]
                if found_columns.count(True) == len(
                        self.needed_output_columns):
                    output[i][do.map['similar_images']][do.map['sha1']].append(
                        simj[0].strip())
                    output[i][do.map['similar_images']][
                        do.map['cached_image_urls']].append(
                            simj[1][self.url_field].strip())
                    ok_sims.append(jj)
            output[i][do.map['similar_images']][do.map['distance']] = [
                sim_score[ii][jj] for jj in ok_sims
            ]
        outp = OrderedDict([[do.map['number'], nb_query],
                            [do.map['images'], output]])
        return outp

    def build_error_output(self, nb_query, inst):
        errors = dict()
        errors['search'] = "[format_output ERROR] could not prepare output. Error was: {}".format(inst)
        output = []
        do = DictOutput()
        outp = OrderedDict([[do.map['number'], nb_query],
                            [do.map['images'], output], ['errors', errors]])
        return outp

    def format_output(self,
                      sim,
                      sim_score,
                      nb_query,
                      corrupted,
                      list_sha1_id,
                      options_dict=dict()):
        # read lopq similarity results and get 'cached_image_urls', 'ads_cdr_ids'
        # and filter out if near_dup is activated
        print "[SearcherLOPQHBase.format_output: log] options are: {}".format(
            options_dict)
        start_build_output = time.time()
        outp = self.build_output(nb_query, corrupted, list_sha1_id, sim,
                                 sim_score, options_dict)
        print "[SearcherLOPQHBase.format_output: log] build_output took: {}".format(
            time.time() - start_build_output)
        #print "[Searcher.format_output: log] output {}".format(output)
        return outp

    def search_image_list(self, image_list, options_dict=dict()):
        # initialization
        search_id = str(time.time())
        # get sha1s and feats for each URL in image_list
        list_sha1_id = []
        feats = []
        tmp_sha1s_feats = self.feature_extractor.compute_sha1_features_fromURLs_nodiskout(
            image_list)
        for sample in tmp_sha1s_feats:
            list_sha1_id.append(sample[0])
            feats.append(sample[1][0])
        return self.search_from_feats(feats, list_sha1_id, options_dict)

    def search_imageB64_list(self, imageB64_list, options_dict=dict()):
        # initialization
        search_id = str(time.time())
        # get sha1s and feats for each image in imageB64_list
        list_sha1_id = []
        feats = []
        tmp_sha1s_feats = self.feature_extractor.compute_sha1_features_fromB64s_nodiskout(
            imageB64_list)
        for sample in tmp_sha1s_feats:
            list_sha1_id.append(sample[0])
            feats.append(sample[1][0])
        return self.search_from_feats(feats, list_sha1_id, options_dict)

    def search_from_feats(self, feats, list_sha1_id, options_dict=dict()):
        # format of results is a list of namedtuples as: namedtuple('Result', ['id', 'code', 'dist'])
        # this does not support batch query
        sim = []
        sim_score = []
        # check what is the near duplicate config
        filter_near_dup = False
        if (self.near_dup and "near_dup" not in options_dict) or (
                "near_dup" in options_dict and options_dict["near_dup"]):
            filter_near_dup = True
            if "near_dup_th" in options_dict:
                near_dup_th = options_dict["near_dup_th"]
            else:
                near_dup_th = self.near_dup_th
        # query for each feature
        for i in range(len(list_sha1_id)):
            norm_feat = np.linalg.norm(feats[i])
            pca_projected_feat = np.squeeze(
                self.searcher_lopq.model.apply_PCA(feats[i] / norm_feat))
            #print "[SearcherLOPQHBase.search_from_feats: log] pca_projected_feat.shape: {}".format(pca_projected_feat.shape)
            results, visited = self.searcher_lopq.search(pca_projected_feat,
                                                         quota=self.quota,
                                                         limit=self.sim_limit,
                                                         with_dists=True)
            print "[SearcherLOPQHBase.search_from_feats: log] got {} results, first one is: {}".format(
                len(results), results[0])
            tmp_sim = []
            tmp_sim_score = []
            for res in results:
                if (filter_near_dup
                        and res.dist <= near_dup_th) or not filter_near_dup:
                    tmp_sim.append(res.id)
                    tmp_sim_score.append(res.dist)
            if tmp_sim:
                rows = self.indexer.get_columns_from_sha1_rows(
                    tmp_sim, self.needed_output_columns)
                # rows contain id and s3_url of all images
                sim.append(rows)
            else:
                sim.append([])
            sim_score.append(tmp_sim_score)
        # format output
        return self.format_output(sim, sim_score, len(list_sha1_id), [],
                                  list_sha1_id, options_dict)
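
# A minimal usage sketch (not from the original code): assuming a JSON
# configuration file defining the 'SE_*' keys read above ('SE_lopq',
# 'SE_lopqmodel', 'SE_codes_path', 'SE_indexer', 'SE_feature_extractor',
# 'SE_sim_limit', 'SE_near_dup', 'SE_near_dup_th', 'SE_ratio', ...), the
# searcher could be driven as follows; the file name and option values are
# illustrative only.
if __name__ == '__main__':
    searcher = SearcherLOPQHBase('conf/global_conf_sample.json')
    out = searcher.search_image_list(
        ['https://example.com/image1.jpg'],
        options_dict={'near_dup': True, 'near_dup_th': 0.15})
    print out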