Пример #1
0
class LOPQ(BaseANN):
    """LOPQ-backed approximate nearest-neighbour index.

    Wraps an ``LOPQModel`` built with ``V=v`` and a fixed ``M=4`` together
    with an in-memory ``LOPQSearcher`` that is created by :meth:`fit`.
    """

    def __init__(self, v):
        """Create the (unfitted) model.

        :param v: forwarded to ``LOPQModel`` as ``V``.
        """
        m = 4
        self.name = 'LOPQ(v={}, m={})'.format(v, m)
        self._m = m
        self._model = LOPQModel(V=v, M=m)
        self._searcher = None  # built by fit()
        print("Init done")

    def fit(self, X):
        """Fit the model on ``X`` and index the same vectors.

        :param X: array-like of vectors; converted to a float32 numpy array.
        """
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._model.fit(X)
        self._searcher = LOPQSearcher(self._model)
        self._searcher.add_data(X)
        print("Fit done")

    def query(self, v, n):
        """Search the index for neighbours of ``v``.

        :param v: query vector (numpy array); cast to float32.
        :param n: requested neighbour count. It is only printed; the search
            itself always runs with ``quota=100``.
        :returns: whatever ``LOPQSearcher.search`` returns for the query.
        """
        v = v.astype(numpy.float32)
        print(v)
        print(n)
        print("-----------------------------------")
        # Use the searcher built in fit() and the query vector itself; the
        # previous code referenced the undefined names `searcher` and `x`.
        nns = self._searcher.search(v, quota=100)
        return nns

    def use_threads(self):
        """Always returns True."""
        return True
Пример #2
0
class ApproximateIndexer(object):
    """Approximate nearest-neighbour indexer built on PCA + LOPQ.

    Vectors are reduced with a 256-component ``PCA``, mean-centred and
    rotated with the ``(P, mu)`` pair returned by ``pca``, and then handed
    to an ``LOPQModel`` / ``LOPQSearcher``.
    """

    def __init__(self, index_name, model_path, lmdb_path, V=16, M=16):
        """Create an unfitted indexer.

        :param index_name: label stored on the instance.
        :param model_path: file used by load() and fit_model() for the
            model protobuf.
        :param lmdb_path: stored on the instance; the in-memory searcher
            created by fit_model() does not use it.
        :param V: first positional argument of ``LOPQModel``.
        :param M: second positional argument of ``LOPQModel``.
        """
        self.model = LOPQModel(V, M)
        self.index_name = index_name
        self.searcher = None
        self.model_path = model_path
        self.lmdb_path = lmdb_path

    def load(self):
        """Replace ``self.model`` with the model stored at ``self.model_path``.

        ``load_proto`` returns the reconstituted model instead of updating
        the object it is called on, so the result has to be assigned;
        previously it was silently discarded.

        NOTE(review): the PCA reduction, ``P`` and ``mu`` are not restored
        here, so transform() is only usable after fit().
        """
        self.model = LOPQModel.load_proto(self.model_path)

    def fit(self, train):
        """Fit the PCA reduction, the rotation and the LOPQ model on ``train``."""
        print(train.shape)
        self.pca_reduction = PCA(n_components=256)
        self.pca_reduction.fit(train)
        train = self.pca_reduction.transform(train)
        self.P, self.mu = pca(train)
        # Mean-centre before rotating so the model is fitted on exactly the
        # representation that transform() produces for indexed/query vectors.
        train = train - self.mu
        train = np.dot(train, self.P)
        print(train.shape)
        self.model.fit(train, n_init=1)

    def transform(self, test):
        """Map raw vectors into the space the model was fitted in.

        Applies the fitted PCA reduction, subtracts ``self.mu`` and rotates
        by ``self.P``.
        """
        print(test.shape)
        test = self.pca_reduction.transform(test)
        test = test - self.mu
        test = np.dot(test, self.P)
        print(test.shape)
        return test

    def fit_model(self, train):
        """Fit on ``train``, export the model to ``self.model_path`` and
        create an (empty) in-memory searcher."""
        self.fit(train)
        self.model.export_proto(self.model_path)
        # In-memory searcher; the LMDB variant would be
        # LOPQSearcherLMDB(self.model, self.lmdb_path).
        self.searcher = LOPQSearcher(self.model)

    def experiment(self, data):
        """Hold out 10% of ``data``, fit and index the rest, print the recall."""
        train, test = train_test_split(data, test_size=0.1)
        print('{} {} {}'.format(data.shape, train.shape, test.shape))
        # Ground truth is computed on the raw (untransformed) vectors.
        nns = compute_all_neighbors(test, train)
        self.fit_model(train)
        self.searcher.add_data(self.transform(train))
        recall, _ = get_recall(self.searcher, self.transform(test), nns)
        print('Recall (V={}, M={}, subquants={}): {}'.format(
            self.model.V, self.model.M, self.model.subquantizer_clusters,
            str(recall)))

    def add_data(self, data):
        """Index ``data`` as given (no transform is applied here)."""
        self.searcher.add_data(data)

    def search(self, x):
        """Search for ``x`` with a fixed ``quota=100``."""
        return self.searcher.search(x, quota=100)
Пример #3
0
class LOPQ(BaseANN):
    """ANN wrapper around an ``LOPQModel`` (``V=v``, ``M=4``) and an
    in-memory ``LOPQSearcher`` that is populated by :meth:`fit`."""

    def __init__(self, v):
        """Build the unfitted model; ``v`` is forwarded as ``V``."""
        subquantizer_splits = 4
        self._searcher = None  # set once fit() has run
        self._m = subquantizer_splits
        self._model = LOPQModel(V=v, M=subquantizer_splits)
        self.name = 'LOPQ(v={}, m={})'.format(v, subquantizer_splits)

    def fit(self, X):
        """Fit the model on ``X`` (cast to float32) and index the same points."""
        points = numpy.array(X).astype(numpy.float32)
        self._model.fit(points)
        self._searcher = LOPQSearcher(self._model)
        self._searcher.add_data(points)

    def query(self, v, n):
        """Return the searcher's result for ``v`` (cast to float32).

        ``n`` is accepted but not used; the search runs with ``quota=100``.
        """
        return self._searcher.search(v.astype(numpy.float32), quota=100)
Пример #4
0
def main(input_dir='/Users/aub3/temptest/gtin/',
         output_dir="/Users/aub3/temptest/products"):
    """Train three LOPQ models on the products index and print their recall.

    The data is reduced to 32 PCA components, mean-centred and rotated,
    split 80/20, and each model is fitted on the training part and scored
    against brute-force neighbours of the held-out part.

    :param input_dir: unused here (the ``products.prepare(input_dir)`` step
        is commented out in this version).
    :param output_dir: path handed to ``external_indexed.ProductsIndex``.
    """

    def evaluate(model, announce=False):
        # Index the training vectors with `model` and print its recall line.
        index = LOPQSearcher(model)
        if announce:
            print("adding data")
        index.add_data(train)
        score, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            model.V, model.M, model.subquantizer_clusters, str(score)))

    products = external_indexed.ProductsIndex(path=output_dir)
    products.build_approximate()
    data = products.data
    print(data.shape)

    reducer = PCA(n_components=32)
    reducer.fit(data)
    data = reducer.transform(data)
    print(data.shape)

    # Mean-centre, then rotate with the projection returned by pca().
    P, mu = pca(data)
    data = np.dot(data - mu, P)

    train, test = train_test_split(data, test_size=0.2)
    print('{} {}'.format(train.shape, test.shape))
    nns = compute_all_neighbors(test, train)

    # Baseline model.
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    print("fitted")
    evaluate(m, announce=True)

    # Reuse the baseline's Cs, with M raised to 16.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    evaluate(m2)

    # Reuse Cs, Rs and mus, with 512 subquantizer clusters.
    m3 = LOPQModel(V=16,
                   M=8,
                   subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    evaluate(m3)
Пример #5
0
 def cluster(self):
     """Fit three LOPQ models on ``self.data`` and print the recall of each.

     ``self.data`` is replaced by its 32-component PCA reduction and then
     mean-centred. The rotated copy is split 80/20 into train/test; each
     model is fitted on the train part and scored against brute-force
     neighbours of the test part. ``self.data`` itself is left un-rotated.
     """
     print(self.data.shape)
     pca_reduction = PCA(n_components=32)
     pca_reduction.fit(self.data)
     self.data = pca_reduction.transform(self.data)
     print(self.data.shape)
     P, mu = self.pca()
     self.data = self.data - mu
     data = np.dot(self.data, P)
     # Split the rotated matrix. Previously the rotation result was computed
     # and then ignored, because the split was taken from self.data.
     train, test = train_test_split(data, test_size=0.2)
     print('{} {}'.format(train.shape, test.shape))
     nns = compute_all_neighbors(test, train)
     m = LOPQModel(V=16, M=8)
     m.fit(train, n_init=1)
     print("fitted")
     searcher = LOPQSearcher(m)
     print("adding data")
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m.V, m.M, m.subquantizer_clusters, str(recall)))
     # Same Cs as the first model, M raised to 16.
     m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
     m2.fit(train, n_init=1)
     searcher = LOPQSearcher(m2)
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m2.V, m2.M, m2.subquantizer_clusters, str(recall)))
     # Same Cs, Rs and mus as the first model, 512 subquantizer clusters.
     m3 = LOPQModel(V=16,
                    M=8,
                    subquantizer_clusters=512,
                    parameters=(m.Cs, m.Rs, m.mus, None))
     m3.fit(train, n_init=1)
     searcher = LOPQSearcher(m3)
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m3.V, m3.M, m3.subquantizer_clusters, str(recall)))
Пример #6
0
class Clustering(object):
    """PCA + LOPQ clustering of feature matrices with an LMDB-backed searcher."""

    def __init__(self,
                 fnames,
                 n_components,
                 model_proto_filename,
                 m,
                 v,
                 sub,
                 test_mode=False,
                 dc=None):
        """
        Load feature matrices and their JSON entries and derive file names.

        TODO: simplify this by having separate create vs. load/init paths.

        :param fnames: paths of feature files readable by ``np.load``. The
            entries for each file are read from the path obtained by
            replacing 'npy' with 'json' in its name.
        :param n_components: number of PCA components used by cluster().
        :param model_proto_filename: model protobuf path; the companion
            file names are derived by replacing '.proto' in it.
        :param m: ``M`` for the ``LOPQModel``.
        :param v: ``V`` for the ``LOPQModel``.
        :param sub: ``subquantizer_clusters`` for the ``LOPQModel``.
        :param test_mode: when true, cluster() indexes only the training
            split and prints recall.
        :param dc: stored on the instance as ``self.dc``.

        ``self.data`` is only set when ``fnames`` is non-empty.
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            # Context manager so the JSON file handle is not leaked.
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace(
            '.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace(
            '.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.

        :returns: ``(P, mu)`` - the eigenvector matrix with its columns
            permuted by ``self.permuted_inds``, and the column means of
            ``self.data``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of the outer products of all rows (X^T X), computed as one
        # float64 matrix product instead of a Python-level reduce over rows.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Reduce, centre and rotate ``self.data``, fit the model on an 80%
        split, annotate the entries with their codes and populate the
        searcher."""
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        # Store each entry's codes now to avoid computing them again in the
        # searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            # Index only the training split so recall can be measured on
            # the held-out vectors.
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M, self.model.subquantizer_clusters,
                str(recall)))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        """Print a randomly chosen entry and the results of a quota-10
        lookup for its vector."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write the model, PCA reduction, ``P``, ``mu`` and the dimension
        permutation to their files, then close the searcher's environment.

        Pickle and ``.npy`` payloads are binary, so every file is opened
        in ``'wb'`` mode.
        """
        self.model.export_proto(self.model_proto_filename)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by save() and reopen the LMDB searcher.

        NOTE: the pickle files are unpickled as-is; only load files that
        this code produced.
        """
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        with open(self.P_filename, 'rb') as src:
            self.P = np.load(src)
        with open(self.mu_filename, 'rb') as src:
            self.mu = np.load(src)
        # save() writes this file with pickle.dump, so read it back with
        # pickle rather than np.load.
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` into model space and return its codes.

        :param vector: raw vector(s) accepted by the fitted PCA reduction.
        :param count: search quota; when falsy no search is performed.
        :returns: ``(coarse, fine, results)`` where ``results`` is the
            searcher output, or ``None`` when ``count`` is falsy.
        """
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results
Пример #7
0
class Clustering(object):
    """PCA + LOPQ clustering of feature matrices with an LMDB-backed searcher."""

    def __init__(self, fnames, n_components, model_proto_filename, m, v, sub, test_mode=False, dc=None):
        """
        Load feature matrices and their JSON entries and derive file names.

        TODO: simplify this by having separate create vs. load/init paths.

        ``fnames`` are feature files readable by ``np.load``; the entries of
        each one are read from the path obtained by replacing 'npy' with
        'json' in its name. ``m``, ``v`` and ``sub`` become the ``M``, ``V``
        and ``subquantizer_clusters`` of the ``LOPQModel``. ``self.data`` is
        only set when ``fnames`` is non-empty.
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            # Context manager so the JSON file handle is not leaked.
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace('.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.

        Returns ``(P, mu)``: the eigenvector matrix with its columns permuted
        by ``self.permuted_inds``, and the column means of ``self.data``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of the outer products of all rows (X^T X), computed as one
        # float64 matrix product instead of a Python-level reduce over rows.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Reduce, centre and rotate ``self.data``, fit the model on an 80%
        split, annotate the entries with their codes and populate the
        searcher."""
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        # Store each entry's codes now to avoid computing them again in the
        # searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            # Index only the training split so recall can be measured on
            # the held-out vectors.
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (self.model.V, self.model.M, self.model.subquantizer_clusters, str(recall)))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        """Print a randomly chosen entry and the results of a quota-10
        lookup for its vector."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write the model, PCA reduction, ``P``, ``mu`` and the dimension
        permutation to their files, then close the searcher's environment.

        Pickle and ``.npy`` payloads are binary, so every file is opened
        in ``'wb'`` mode.
        """
        self.model.export_proto(self.model_proto_filename)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by save() and reopen the LMDB searcher.

        NOTE: the pickle files are unpickled as-is; only load files that
        this code produced.
        """
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        with open(self.P_filename, 'rb') as src:
            self.P = np.load(src)
        with open(self.mu_filename, 'rb') as src:
            self.mu = np.load(src)
        # save() writes this file with pickle.dump, so read it back with
        # pickle rather than np.load.
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model, lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` into model space and return
        ``(coarse, fine, results)``; ``results`` is ``None`` when ``count``
        is falsy, otherwise the searcher output for ``quota=count``."""
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu), self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results
class LOPQTrainer(object):
    """Trains a PCA + LOPQ model and writes its artefacts to a directory."""

    def __init__(self, name, components, m, v, sub, dirname,
                 source_indexer_shasum):
        """
        :param name: model name reported by save().
        :param components: number of PCA components (converted with int()).
        :param m: ``M`` for the ``LOPQModel`` (converted with int()).
        :param v: ``V`` for the ``LOPQModel`` (converted with int()).
        :param sub: ``subquantizer_clusters`` (converted with int()).
        :param dirname: directory the artefacts are written to by save().
        :param source_indexer_shasum: reported by save() as
            ``arguments['indexer_shasum']``.
        """
        self.name = name
        self.n_components = int(components)
        self.m = int(m)
        self.v = int(v)
        self.dirname = dirname
        self.sub = int(sub)
        self.model = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.source_indexer_shasum = source_indexer_shasum

    def pca(self, training_data):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        sub vectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.

        :param training_data: 2-D array (its shape is unpacked into two values).
        :returns: ``(P, mu)`` - the eigenvector matrix with its columns
            permuted by ``self.permuted_inds``, and the column means.
        """
        count, _ = training_data.shape
        mu = training_data.sum(axis=0) / float(count)
        # Sum of the outer products of all rows (X^T X), computed as one
        # float64 matrix product instead of a Python-level reduce over rows.
        rows = np.asarray(training_data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def train(self, training_data):
        """Fit the PCA reduction, the rotation and the LOPQ model on
        ``training_data``."""
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(training_data)
        training_data = self.pca_reduction.transform(training_data)
        self.P, self.mu = self.pca(training_data)
        training_data = training_data - self.mu
        training_data = np.dot(training_data, self.P)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(training_data, n_init=1)

    def save(self):
        """Write all model artefacts into ``self.dirname``.

        Every payload (protobuf, pickle, ``.npy``) is binary, so the files
        are opened in ``'wb'`` mode and the protobuf is read back in
        ``'rb'`` mode for hashing.

        :returns: a dict with the model name, algorithm, the SHA-1 of the
            written protobuf file, the training arguments and the list of
            written files.
        """
        artefact_names = ("model.proto", "model.P.npy", "model.mu.npy",
                          "model.pca.pkl", "model.permind.pkl")
        model_proto_filename = "{}/model.proto".format(self.dirname)
        P_filename = "{}/model.P.npy".format(self.dirname)
        mu_filename = "{}/model.mu.npy".format(self.dirname)
        pca_filename = "{}/model.pca.pkl".format(self.dirname)
        permind_filename = "{}/model.permind.pkl".format(self.dirname)
        with open(model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(permind_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        # Hash the protobuf exactly as it landed on disk; the handle is
        # closed as soon as the bytes are read.
        with open(model_proto_filename, 'rb') as proto_file:
            shasum = hashlib.sha1(proto_file.read()).hexdigest()
        j = {
            "name": self.name,
            "algorithm": "LOPQ",
            "shasum": shasum,
            "model_type": "P",
            "arguments": {
                'm': self.m,
                'v': self.v,
                'sub': self.sub,
                'components': self.n_components,
                'indexer_shasum': self.source_indexer_shasum,
            },
            "files": [{
                "filename": artefact,
                "url": "{}/{}".format(self.dirname, artefact),
            } for artefact in artefact_names],
        }
        return j
Пример #9
0
def main():
    """
    Demo script: train several LOPQ models on the Oxford data, printing the
    recall of each one and discussing the trade-offs along the way.
    """

    def report_recall(model):
        # Populate a fresh LOPQSearcher with the training vectors and print
        # recall stats for the held-out queries. By default 1000 ranked
        # results are retrieved per query vector for the recall evaluation.
        index = LOPQSearcher(model)
        index.add_data(train)
        stats, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' %
              (model.V, model.M, model.subquantizer_clusters, str(stats)))

    # Oxford dataset; see the README in data/oxford for details.
    data = load_oxford_data()

    # PCA of the dataset, followed by mean centering and rotation (which
    # includes the dimension permutation). The dataset is already PCA'd from
    # higher dimensional features; this extra variance-balancing step
    # typically improves recall@1 by 3-5% here, and the benefit can be much
    # greater on other datasets.
    P, mu = pca(data)
    data = np.dot(data - mu, P)

    # The test split becomes the query set, for which the true nearest
    # neighbours in the training split are computed (distance-sorted). These
    # are the ground truth for recall evaluation.
    train, test = train_test_split(data, test_size=0.2)
    nns = compute_all_neighbors(test, train)

    # Baseline model. No random seed is given, so runs differ. A warning
    # that some local projections cannot be estimated (too few points in a
    # cluster) is fine for this demo; more training data or a smaller V
    # avoids it.
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    report_recall(m)

    # Hyperparameters can be explored without discarding every fitted
    # parameter: keep the coarse quantizers, raise the number of
    # subquantizers (M). Recall is probably higher, at the cost of more data
    # per index item.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    report_recall(m2)

    # Keep both the coarse quantizers and the rotations fixed and see what
    # more subquantizer clusters do to performance.
    m3 = LOPQModel(V=16,
                   M=8,
                   subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    report_recall(m3)
Пример #10
0
Файл: lopq.py Проект: Aguin/CBVR

gtobj = GTOBJ()
# Labels counted as relevant for each retrieval task.
relevant_labels_mapping = {
    'DSVR': ['ND', 'DS'],
    'CSVR': ['ND', 'DS', 'CS'],
    'ISVR': ['ND', 'DS', 'CS', 'IS'],
}

print('LOPQModel!')
start = time.time()
final_vids, features, vid2features = load_features(
    '/home/camp/FIVR/features/vcms_v1', is_gv=False)
# Flatten the features into one matrix with 512 columns and use that same
# matrix for both fitting and indexing. Previously the model was fitted on
# the reshaped array while the searcher indexed the raw `features` object.
# NOTE(review): assumes each feature is a 512-d vector - confirm against
# load_features.
feature_matrix = np.array(features).reshape(-1, 512)
# Define a model and fit it to data
model = LOPQModel(V=8, M=4)
model.fit(feature_matrix)
# Create a searcher to index data with the model
searcher = LOPQSearcher(model)
searcher.add_data(feature_matrix)
print('Read time: %.2f' % (time.time() - start))

# Load the features
vids = list(vid2features.keys())
print(vids[:10])
global_features = np.squeeze(
    np.asarray(list(vid2features.values()), np.float32))
print(np.shape(global_features))

# Load vid2name and name2vid
with open('/home/camp/FIVR/vid2name.pk', 'rb') as pk_file:
    vid2names = pk.load(pk_file)
Пример #11
0
class LOPQRetriever(BaseRetriever):
    """Retriever backed by a PCA + LOPQ model and an LMDB searcher."""

    def __init__(self, name, args):
        """
        Load feature matrices and their JSON entries and derive file names.

        :param name: retriever name stored on the instance.
        :param args: dict; reads the optional keys ``'fnames'`` and
            ``'test_mode'`` and the required keys ``'components'``,
            ``'m'``, ``'v'``, ``'sub'`` and ``'proto_filename'``.

        The entries of each feature file are read from the path obtained by
        replacing 'npy' with 'json' in its name. ``self.data`` is only set
        when at least one file name is given.
        """
        data = []
        self.name = name
        self.fnames = args.get('fnames', [])
        self.entries = []
        for fname in self.fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                logging.info("squeezing  shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = nmat.squeeze(axis=1)
            elif nmat.ndim == 1:
                logging.info("expanding  shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = np.expand_dims(nmat, axis=0)
            else:
                logging.info(
                    "keeping same  shape {} with dimensions {}".format(
                        nmat.shape, nmat.ndim))
            data.append(nmat)
            # Context manager so the JSON file handle is not leaked.
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = args.get('test_mode', False)
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        proto_filename = args['proto_filename']
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace(
            '.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.

        :returns: ``(P, mu)`` - the eigenvector matrix with its columns
            permuted by ``self.permuted_inds``, and the column means of
            ``self.data``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of the outer products of all rows (X^T X), computed as one
        # float64 matrix product instead of a Python-level reduce over rows.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Reduce, centre and rotate ``self.data``, fit the model on all of
        it, annotate the entries with their codes and index everything.

        ``self.test_mode`` is not consulted here; the train/test recall
        evaluation is disabled in this version.
        """
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)
        # Store each entry's codes now to avoid computing them again in the
        # searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)

    def find(self):
        """Print a randomly chosen entry and the results of a quota-10
        lookup for its vector."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write the model, PCA reduction, ``P``, ``mu`` and the dimension
        permutation to their files, then close the searcher's environment.

        Every payload is binary, so the files are opened in ``'wb'`` mode.
        Each file is written in its own ``with`` block rather than inside
        the protobuf file's block.
        """
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by save() and reopen the LMDB searcher.

        NOTE: the pickle files are unpickled as-is; only load files that
        this code produced.
        """
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        with open(self.P_filename, 'rb') as src:
            self.P = np.load(src)
        with open(self.mu_filename, 'rb') as src:
            self.mu = np.load(src)
        # save() writes this file with pickle.dump, so read it back with
        # pickle rather than np.load.
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` into model space and return its codes.

        :param vector: raw vector(s) accepted by the fitted PCA reduction.
        :param count: search quota; when falsy no search is performed.
        :returns: ``(coarse, fine, results)`` where ``results`` is the
            searcher output, or ``None`` when ``count`` is falsy.
        """
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        """Return ranked result dicts for the ``n``-quota search of ``vector``.

        :param vector: query vector passed to apply().
        :param n: search quota passed to apply().
        :param retriever_pk: forwarded to ``entry_getter``.
        :param entry_getter: callable ``(result_id, retriever_pk)`` that
            returns the stored entry for a search hit.
        :returns: list of dicts, one per hit. ``'rank'`` is one-based and
            ``'dist'`` holds the zero-based position of the hit, not a
            distance. Hits whose entry has a ``detection_id`` are typed
            ``'detection'``; the others are typed ``'frame'``.
        """
        results = []
        coarse, fine, results_indexes = self.apply(vector, n)
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            if e.detection_id:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'detection_primary_key': e.detection_id,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'detection',
                })
            else:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'frame',
                })
        return results
Пример #12
0
def main():
    """
    Demo script: fit several LOPQ models on the Oxford dataset and print the
    recall obtained with each, to illustrate hyperparameter trade-offs.
    """

    # Load the Oxford features. See the README in data/oxford for details
    # about this dataset.
    features = load_oxford_data()

    # PCA of the dataset, then mean-center and rotate (the rotation includes
    # a dimension permutation). The data is already PCA'd from higher
    # dimensional features, yet this extra variance-balancing step typically
    # improves recall@1 by 3-5% here and can help far more on other datasets.
    rotation, mean = pca(features)
    features = np.dot(features - mean, rotation)

    # Hold out a test split to act as queries, and compute their
    # distance-sorted neighbors in the training split as ground truth.
    train, test = train_test_split(features, test_size=0.2)
    nns = compute_all_neighbors(test, train)

    def report_recall(model):
        # Index the training split with `model`, evaluate recall against the
        # ground truth (get_recall's default result count is used) and print it.
        index = LOPQSearcher(model)
        index.add_data(train)
        stats, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s'
              % (model.V, model.M, model.subquantizer_clusters, str(stats)))

    # Baseline model. No random seed is given, so runs differ. A warning that
    # some local projections cannot be estimated (too few points in a cluster)
    # is acceptable for this demo; more training data or a smaller V avoids it.
    baseline = LOPQModel(V=16, M=8)
    baseline.fit(train, n_init=1)
    report_recall(baseline)

    # Reuse the baseline's coarse quantizers but double the number of
    # subquantizers (M). Recall is probably higher: finer quantization at the
    # expense of more data per index item.
    finer = LOPQModel(V=16, M=16, parameters=(baseline.Cs, None, None, None))
    finer.fit(train, n_init=1)
    report_recall(finer)

    # Hold both coarse quantizers and rotations fixed and increase only the
    # number of subquantizer clusters.
    wider = LOPQModel(V=16, M=8, subquantizer_clusters=512,
                      parameters=(baseline.Cs, baseline.Rs, baseline.mus, None))
    wider.fit(train, n_init=1)
    report_recall(wider)
Пример #13
0
def main(new=True):
    """
    Compare brute-force nearest-neighbour search with an LOPQ index.

    When ``new`` is true, 3000 random 128-dimensional vectors are generated
    and saved to ``data.npy``, and a freshly fitted model is exported to
    ``params.mat``. Otherwise both are loaded from those files.

    Prints the brute-force top 10, the LOPQ top 10, timings, and how many of
    the LOPQ hits also appear in the brute-force top 10.
    """
    top_k = 10

    # data: 3000 x 128dim
    if new:
        data = np.vstack((np.random.rand(1000, 128),
                          np.random.rand(1000, 128) + 1,
                          np.random.rand(1000, 128) - 1))
        print('make data')
        np.save('data.npy', data)
    else:
        data = np.load('./data.npy')

    # Query point whose nearest neighbours we want.
    x = np.ones(128) * 2

    print('naive implementation')
    start = time.time()
    dist = np.sum(np.power((data - x), 2), axis=1)
    res = np.argsort(dist)
    print(res[0:top_k])  # indices of the exact top 10
    print('%s s taken for naive NNsearch' % (time.time() - start))

    if new:
        # Define a model, fit it to the data and persist it.
        model = LOPQModel(V=3, M=2, subquantizer_clusters=64)
        start = time.time()
        model.fit(data)
        print('%s s taken for model fitting' % (time.time() - start))
        model.export_mat('params.mat')
    else:
        model = LOPQModel.load_mat('params.mat')

    # To inspect the LOPQ codes of a single vector, call model.predict(y)
    # with a 128-dim y. With SC = subquantizer_clusters and M = 2 the output
    # is: coarse codes (V, V), fine codes (SC, SC).

    # Create a searcher to index data with the model.
    searcher = LOPQSearcher(model)
    searcher.add_data(data)

    start = time.time()
    # Retrieve ranked nearest neighbours; the first field of each hit is
    # taken as its id. Slicing (rather than indexing 0..9) avoids an
    # IndexError if the searcher returns fewer than top_k hits.
    nns = searcher.search(x, quota=top_k)
    ans = [hit[0] for hit in nns[0][:top_k]]
    print(ans)
    print('%s s taken for prediction top 10' % (time.time() - start))

    # Count how many approximate hits are also exact top-k neighbours. The
    # original attached this report to the loop's ``else`` clause by accident.
    exact = set(res[:top_k].tolist())
    count = sum(1 for element in ans if element in exact)
    print('accuracy:  %s / %s' % (count, top_k))
Пример #14
0
class LOPQRetriever(BaseRetriever):
    """Approximate nearest-neighbour retriever built around an LOPQ model.

    Vectors are reduced with a PCA, mean-centred, rotated by ``P`` and then
    encoded/indexed with an ``LOPQModel`` and an ``LOPQSearcherLMDB``. All
    artefacts are stored next to the model proto file, with names derived
    from ``proto_filename`` by replacing ``'.proto'``.
    """

    def __init__(self, name, proto_filename, args, test_mode=False):
        """
        :param name: retriever name.
        :param proto_filename: path of the model proto file; sibling file
            names are derived from it.
        :param args: mapping with ``'components'``, ``'m'``, ``'v'`` and
            ``'sub'`` entries, each convertible with ``int``.
        :param test_mode: stored on the instance as ``test_mode``.
        """
        # NOTE(review): super(BaseRetriever, self) resolves to the class
        # *after* BaseRetriever in the MRO, so BaseRetriever.__init__ is not
        # run. Left as-is because BaseRetriever's signature is not visible
        # here; confirm this is intentional.
        super(BaseRetriever, self).__init__()
        self.name = name
        self.proto_filename = proto_filename
        self.entries = []
        self.test_mode = test_mode
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.entries_filename = proto_filename.replace('.proto', '.json')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        sub vectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a
        variance-balancing dimension permutation.

        Operates on ``self.data`` and stores the permutation in
        ``self.permuted_inds``.

        :return: ``(P, mu)``, the permuted eigenvector matrix and the
            per-dimension mean of ``self.data``.
        """
        # Builtin on Python 2, functools-only on Python 3; this import works
        # on both.
        from functools import reduce

        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Fit the PCA and LOPQ model on ``self.index`` and build the searcher.

        Also annotates every dict in ``self.entries`` with its ``'coarse'``
        and ``'fine'`` codes and its positional ``'index'``.
        """
        # NOTE(review): self.index is not assigned anywhere in this class;
        # presumably the caller or base class provides it. Verify.
        self.data = self.index
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)  # replace self.data by train
        # The searcher encodes the data again below; computing the codes here
        # as well is redundant work that could be avoided.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)

    def find(self):
        """Print a randomly chosen entry followed by its quota-10 results."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write the model and all auxiliary artefacts to disk.

        Closes the searcher's LMDB environment afterwards.
        """
        # Proto, pickle and .npy payloads are binary, so use binary handles.
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.entries_filename, 'w') as out:
            # json.dump takes (obj, fp); the arguments used to be swapped,
            # which made save() fail before finishing.
            json.dump(self.entries, out)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore the model, projections and searcher written by save()."""
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files that were produced by save().
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        with open(self.P_filename, 'rb') as src:
            self.P = np.load(src)
        with open(self.mu_filename, 'rb') as src:
            self.mu = np.load(src)
        # Written with pickle.dump in save(), so read it back with pickle;
        # np.load refuses pickled payloads by default in recent NumPy.
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` into model space, encode it and optionally search.

        :param vector: input accepted by ``pca_reduction.transform``.
        :param count: search quota; when falsy no search is performed.
        :return: ``(coarse, fine, results)`` with ``results`` set to ``None``
            if no search was run.
        """
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        results = self.searcher.search(vector, quota=count) if count else None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        """Return ranked result dicts for the entries nearest to ``vector``.

        :param vector: query vector, forwarded to :meth:`apply`.
        :param n: search quota forwarded to :meth:`apply`.
        :param retriever_pk: passed through unchanged to ``entry_getter``.
        :param entry_getter: callable ``(hit_id, retriever_pk) -> entry``; the
            entry must expose ``detection_id``, ``frame.frame_index``,
            ``frame_id`` and ``video_id``.
        :return: list of dicts in search order; empty when no search was
            performed.
        """
        _, _, hits = self.apply(vector, n)
        if hits is None:
            # apply() skips the search when ``n`` is falsy (0 or None);
            # indexing the missing result used to raise TypeError here.
            return []
        results = []
        for position, hit in enumerate(hits[0]):
            e = entry_getter(hit.id, retriever_pk)
            is_detection = bool(e.detection_id)
            # 'dist' is the zero-based position in the ranking, not a metric
            # distance. Key insertion order matches the previous literals.
            record = {'rank': position + 1, 'dist': position}
            if is_detection:
                record['detection_primary_key'] = e.detection_id
            record['frame_index'] = e.frame.frame_index
            record['frame_primary_key'] = e.frame_id
            record['video_primary_key'] = e.video_id
            record['type'] = 'detection' if is_detection else 'frame'
            results.append(record)
        return results