def test_string_loaded():
    """Reload a saved string index and print the 2 nearest neighbours per query.

    The data strings are added to a fresh index handle, then the index
    structure is read from ``small_world_rand.index`` with
    ``nmslib.loadIndex`` rather than rebuilt with ``createIndex``.
    Presumably that file has to exist in the working directory beforehand.

    All output goes through ``print(...)`` with a single pre-formatted
    argument, so the text is the same on Python 2 and Python 3.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf",
                 "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    # ``point_id`` rather than ``id`` so the builtin is not shadowed.
    for point_id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    # No index-time parameters are needed: the index is loaded, not built.
    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)
    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the loaded index:")

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        # One formatted string keeps "idx result" output identical on 2 and 3.
        print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

    nmslib.freeIndex(index)
def fit(self, X):
    """Populate an nmslib index with the rows of ``X`` and build or load it.

    Every row of ``X`` is added as a dense float vector. If a file named
    ``self._index_name`` exists, the index is loaded from it; otherwise it
    is created with ``self._index_param`` and, when ``self._save_index`` is
    truthy, saved to that file. Query-time parameters are applied last.

    Args:
        X: data set exposing ``shape[0]`` and yielding rows with a
            ``tolist()`` method (presumably a 2-D numpy array).
    """
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what():  The data size is too small or the bucket size is too big.
        #          Select the parameters so that <total # of records> is NOT
        #          less than <bucket size> * 1000
        # Aborted (core dumped)
        # NOTE(review): this appends to self._index_param on every call, and
        # the computed size is 0 for fewer than 2000 rows -- confirm both
        # are acceptable to nmslib / the callers.
        self._index_param.append(
            'bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

    self._index = nmslib.init(self._nmslib_metric,
                              [],
                              self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    for i, x in enumerate(X):
        nmslib.addDataPoint(self._index, i, x.tolist())

    if os.path.exists(self._index_name):
        # print() with one argument behaves the same on Python 2 and 3.
        print("Loading index from file")
        nmslib.loadIndex(self._index, self._index_name)
    else:
        nmslib.createIndex(self._index, self._index_param)
        if self._save_index:
            nmslib.saveIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def test_string_fresh(batch=True):
    """Build a Levenshtein index over a few strings, query it and save it.

    Args:
        batch: when true, the strings are added with one
            ``addDataPointBatch`` call and ``knnQueryBatch`` is invoked once
            before the per-query loop; otherwise the strings are added one
            at a time.

    The index is saved to ``small_world_rand.index`` and then freed. All
    output uses ``print(...)`` with one pre-formatted argument so the text
    is the same on Python 2 and Python 3.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf",
                 "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        # Wrap the list in a 1-tuple so %-formatting treats it as one value.
        print('DATA_STRS %s' % (DATA_STRS,))
        positions = nmslib.addDataPointBatch(
            index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        # ``point_id`` rather than ``id`` so the builtin is not shadowed.
        for point_id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        # NOTE(review): this batch result is overwritten by the loop below
        # and never inspected. Assumes the loop sits outside this ``if`` --
        # confirm against the original layout.
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print('%s %s %s %s'
              % (idx, data, res, [DATA_STRS[i] for i in res]))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_add_points(self):
    """Two sparse points added in a row must come back as positions 0 and 1."""
    # (external id, entries) in insertion order; the expected return value
    # of addDataPoint is simply the position in this sequence.
    cases = (
        (1000, [[0, 0.5], [5, 0.3], [6, 0.4]]),
        (1001, [[0, 0.5], [3, 0.3], [5, 0.4]]),
    )
    for expected_pos, (point_id, entries) in enumerate(cases):
        self.assertEqual(
            expected_pos,
            nmslib.addDataPoint(self.index, point_id, entries))
def test_string_fresh(batch=True):
    """Create, query and save a small Levenshtein-distance string index.

    Args:
        batch: if true, load the strings through ``addDataPointBatch`` and
            call ``knnQueryBatch`` once ahead of the per-query loop; if
            false, add the strings individually with ``addDataPoint``.

    The index is written to ``small_world_rand.index`` before it is freed.
    Each ``print`` call receives one pre-formatted string, which yields the
    same text on Python 2 and Python 3.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa",
        "a", "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb"
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type, space_param, method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        # 1-tuple keeps %-formatting from unpacking the list argument.
        print('DATA_STRS %s' % (DATA_STRS,))
        positions = nmslib.addDataPointBatch(
            index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        # Avoid the name ``id`` so the builtin stays usable.
        for point_id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        # NOTE(review): result is discarded -- the loop below reassigns
        # ``res`` per query. Assumes the loop is not nested in this ``if``;
        # confirm against the original layout.
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        neighbours = [DATA_STRS[i] for i in res]
        print('%s %s %s %s' % (idx, data, res, neighbours))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def fit(self, X):
    """Create an 'l2' nmslib index over the rows of ``X``.

    For the 'vptree' method a ``bucketSize`` entry derived from the row
    count (capped at 1000) is appended to ``self._method_param`` first.

    Args:
        X: data set exposing ``shape[0]`` and yielding rows with a
            ``tolist()`` method (presumably a 2-D numpy array).
    """
    if self._method_name == 'vptree':
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._method_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init(
        'l2',
        [],
        self._method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    for row_no, row in enumerate(X):
        nmslib.addDataPoint(self._index, row_no, row.tolist())

    nmslib.createIndex(self._index, self._method_param)
def test_object_as_string_fresh(batch=True):
    """Index 'sample_dataset.txt' entries passed as strings, query, and save.

    Args:
        batch: if true, read all entries into a list and add them with one
            ``addDataPointBatch`` call; otherwise add them one at a time.

    Any existing ``small_world_rand.index`` file is removed first, and the
    freshly built index is saved under that name before being freed.
    """
    metric_space = 'cosinesimil'
    space_args = []
    algo = 'small_world_rand'
    index_file = algo + '.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    handle = nmslib.init(
        metric_space,
        space_args,
        algo,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.FLOAT,
    )

    if not batch:
        for point_id, entry in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(handle, point_id, entry)
    else:
        entries = list(read_data_as_string('sample_dataset.txt'))
        positions = nmslib.addDataPointBatch(
            handle, np.arange(len(entries), dtype=np.int32), entries)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(handle))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(handle))
    for pos in range(shown):
        print(nmslib.getDataPoint(handle, pos))

    print('Let\'s invoke the index-build process')

    build_opts = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_opts = ['efSearch=50']

    nmslib.createIndex(handle, build_opts)
    print('The index is created')

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    top_k = 3
    for query_no, query in enumerate(
            read_data_as_string('sample_queryset.txt')):
        print(query_no, nmslib.knnQuery(handle, top_k, query))

    nmslib.saveIndex(handle, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(handle)
def test_object_as_string_fresh(batch=True):
    """Build, query and save an index whose entries are passed as strings.

    Args:
        batch: if true, read every entry of 'sample_dataset.txt' into a
            list and add them with one ``addDataPointBatch`` call;
            otherwise add them one by one.

    An existing ``small_world_rand.index`` file is deleted up front and the
    new index is saved under that name. Output uses ``print(...)`` with a
    single pre-formatted argument so it is identical on Python 2 and 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.FLOAT)

    if batch:
        data = [s for s in read_data_as_string('sample_dataset.txt')]
        positions = nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        # ``point_id`` rather than ``id`` so the builtin is not shadowed.
        for point_id, data in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        # One formatted string keeps "idx result" output identical on 2 and 3.
        print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_vector_load(fast=True, fast_batch=True, seq=True):
    """Time three ways of loading dense vectors from '/tmp/foo.txt'.

    Args:
        fast: time ``read_data_fast`` plus a single ``addDataPointBatch``.
        fast_batch: time ``read_data_fast_batch`` (chunks of 10000) with
            one ``addDataPointBatch`` per chunk.
        seq: time ``read_data`` with one ``addDataPoint`` per vector.

    The data file is generated (100000 x 1000 random values, tab separated)
    when it does not exist yet. No index is built; every section frees its
    handle after loading.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_file = method_name + '.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    dump_path = '/tmp/foo.txt'
    if not os.path.isfile(dump_path):
        print('creating %s' % dump_path)
        np.savetxt(dump_path, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    def _new_index():
        # Each timed section starts from its own empty dense-vector index.
        return nmslib.init(space_type,
                           space_param,
                           method_name,
                           nmslib.DataType.DENSE_VECTOR,
                           nmslib.DistType.FLOAT)

    if fast:
        handle = _new_index()
        with TimeIt('fast add data point'):
            vectors = read_data_fast(dump_path)
            ids = np.arange(len(vectors), dtype=np.int32)
            nmslib.addDataPointBatch(handle, ids, vectors)
        nmslib.freeIndex(handle)

    if fast_batch:
        handle = _new_index()
        with TimeIt('fast_batch add data point'):
            offset = 0
            for chunk in read_data_fast_batch(dump_path, 10000):
                # Shift the chunk-local ids so they stay unique overall.
                ids = np.arange(len(chunk), dtype=np.int32) + offset
                nmslib.addDataPointBatch(handle, ids, chunk)
                offset += chunk.shape[0]
        print('offset', offset)
        nmslib.freeIndex(handle)

    if seq:
        handle = _new_index()
        with TimeIt('seq add data point'):
            for point_id, vector in enumerate(read_data(dump_path)):
                nmslib.addDataPoint(handle, point_id, vector)
        nmslib.freeIndex(handle)
def test_vector_load(fast=True, fast_batch=True, seq=True):
    """Time three strategies for loading dense vectors into an index.

    Args:
        fast: time ``read_data_fast`` followed by one ``addDataPointBatch``.
        fast_batch: time ``read_data_fast_batch`` (chunks of 10000) with
            one ``addDataPointBatch`` per chunk.
        seq: time ``read_data`` with one ``addDataPoint`` per vector.

    '/tmp/foo.txt' is generated (100000 x 1000 random values, tab
    separated) if it is missing. No index is built. Output uses
    ``print(...)`` with one pre-formatted argument so the text is the same
    on Python 2 and Python 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print('creating %s' % f)
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    if fast:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(
                index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                # Ids continue from where the previous chunk stopped.
                nmslib.addDataPointBatch(
                    index,
                    np.arange(len(data), dtype=np.int32) + offset,
                    data)
                offset += data.shape[0]
        # NOTE(review): assumed to run once after the timed section --
        # confirm against the original layout.
        print('offset %s' % offset)
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            # ``point_id`` rather than ``id`` so the builtin is not shadowed.
            for point_id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, point_id, data)
        nmslib.freeIndex(index)
def fit(self, X):
    """Create an nmslib index over the rows of ``X`` using the stored metric.

    For the 'vptree' method a ``bucketSize`` entry derived from the row
    count (capped at 1000) is appended to ``self._method_param`` first.

    Args:
        X: data set exposing ``shape[0]`` and yielding rows with a
            ``tolist()`` method (presumably a 2-D numpy array).
    """
    if self._method_name == 'vptree':
        # Works around this nmslib abort:
        #   terminate called after throwing an instance of 'std::runtime_error'
        #   what():  The data size is too small or the bucket size is too big.
        #            Select the parameters so that <total # of records> is NOT
        #            less than <bucket size> * 1000
        #   Aborted (core dumped)
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._method_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init(
        self._nmslib_metric,
        [],
        self._method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    for row_no, row in enumerate(X):
        nmslib.addDataPoint(self._index, row_no, row.tolist())

    nmslib.createIndex(self._index, self._method_param)
def test_string_loaded():
    """Reload the saved string index and print the 2 nearest neighbours.

    Data strings are added to a fresh handle; the index structure is then
    read from ``small_world_rand.index`` via ``nmslib.loadIndex`` instead of
    being built with ``createIndex``.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa",
        "a", "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb",
    ]
    QUERY_STRS = ["abc", "def", "ghik"]

    metric_space = 'leven'
    space_args = []
    algo = 'small_world_rand'
    index_file = algo + '.index'

    handle = nmslib.init(
        metric_space,
        space_args,
        algo,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.INT,
    )

    for point_id, text in enumerate(DATA_STRS):
        nmslib.addDataPoint(handle, point_id, text)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(handle))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(handle))
    for pos in range(shown):
        print(nmslib.getDataPoint(handle, pos))

    print('Let\'s invoke the index-build process')

    # Defined as in the index-building tests, but not used when loading.
    build_opts = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_opts = ['efSearch=50']

    nmslib.loadIndex(handle, index_file)
    print("The index %s is loaded" % index_file)

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the loaded index:")

    top_k = 2
    for query_no, query in enumerate(QUERY_STRS):
        print(query_no, nmslib.knnQuery(handle, top_k, query))

    nmslib.freeIndex(handle)
def test_sparse_vector_fresh():
    """Build, query and save a sparse-vector index from the sample files.

    Points come from 'sample_sparse_dataset.txt' and queries from
    'sample_sparse_queryset.txt' (both read with ``read_sparse_data``).
    An existing ``small_world_rand_sparse.index`` is removed first and the
    new index is saved under that name before being freed.
    """
    metric_space = 'cosinesimil_sparse_fast'
    space_args = []
    algo = 'small_world_rand'
    index_file = algo + '_sparse.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    handle = nmslib.init(
        metric_space,
        space_args,
        algo,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    for point_id, vector in enumerate(
            read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(handle, point_id, vector)

    print('We have added %d data points' % nmslib.getDataPointQty(handle))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(handle))
    for pos in range(shown):
        print(nmslib.getDataPoint(handle, pos))

    print('Let\'s invoke the index-build process')

    build_opts = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_opts = ['efSearch=50']

    nmslib.createIndex(handle, build_opts)
    print('The index is created')

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    top_k = 3
    for query_no, query in enumerate(
            read_sparse_data('sample_sparse_queryset.txt')):
        print(query_no, nmslib.knnQuery(handle, top_k, query))

    nmslib.saveIndex(handle, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(handle)
def test_sparse_vector_fresh():
    """Build, query and save a 'cosinesimil_sparse' index from sample files.

    Points come from 'sample_sparse_dataset.txt' and queries from
    'sample_sparse_queryset.txt'. An existing
    ``small_world_rand_sparse.index`` is deleted first and the new index is
    saved under that name. Output uses ``print(...)`` with one
    pre-formatted argument so the text is the same on Python 2 and 3.
    """
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    # ``point_id`` rather than ``id`` so the builtin is not shadowed.
    for point_id, data in enumerate(
            read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, point_id, data)

    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    for idx, data in enumerate(
            read_sparse_data('sample_sparse_queryset.txt')):
        # One formatted string keeps "idx result" output identical on 2 and 3.
        print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def fit(self, X):
    """Fill an nmslib index with the rows of ``X``, then build or load it.

    When no file exists at ``self._index_name`` the index is created with
    ``self._index_param`` and optionally saved (``self._save_index``);
    otherwise it is loaded from that file. Query-time parameters from
    ``self._query_param`` are applied in both cases.

    Args:
        X: iterable of rows, each providing ``tolist()`` (presumably a 2-D
            numpy array).
    """
    # Imported here rather than at module level, as in the original.
    import nmslib

    self._index = nmslib.init(
        self._nmslib_metric,
        [],
        self._method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    for row_no, row in enumerate(X):
        nmslib.addDataPoint(self._index, row_no, row.tolist())

    if not os.path.exists(self._index_name):
        logging.debug("Create Index")
        nmslib.createIndex(self._index, self._index_param)
        if self._save_index:
            nmslib.saveIndex(self._index, self._index_name)
    else:
        logging.debug("Loading index from file")
        nmslib.loadIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def test_vector_loaded():
    """Reload the saved dense-vector index and print 2 neighbours per query.

    Vectors from 'sample_dataset.txt' are added to a fresh handle; the
    process exits with status 1 if ``addDataPoint`` returns a position that
    differs from the id passed in. The index structure is then read from
    ``small_world_rand.index`` with ``nmslib.loadIndex``. Output uses
    ``print(...)`` with one pre-formatted argument so the text is the same
    on Python 2 and Python 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    # ``point_id`` rather than ``id`` so the builtin is not shadowed.
    for point_id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, point_id, data)
        if point_id != pos:
            print('id %s != pos %s' % (point_id, pos))
            sys.exit(1)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)
    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the loaded index")

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        # One formatted string keeps "idx result" output identical on 2 and 3.
        print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

    nmslib.freeIndex(index)
def test_vector_loaded():
    """Reload the saved dense-vector index and print 2 neighbours per query.

    Vectors from 'sample_dataset.txt' are added to a fresh handle; the
    process exits with status 1 as soon as ``addDataPoint`` returns a
    position different from the id passed in. The index structure is read
    from ``small_world_rand.index`` rather than rebuilt.
    """
    metric_space = 'cosinesimil'
    space_args = []
    algo = 'small_world_rand'
    index_file = algo + '.index'

    handle = nmslib.init(
        metric_space,
        space_args,
        algo,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    for point_id, vector in enumerate(read_data('sample_dataset.txt')):
        stored_at = nmslib.addDataPoint(handle, point_id, vector)
        if stored_at == point_id:
            continue
        print('id %s != pos %s' % (point_id, stored_at))
        sys.exit(1)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(handle))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(handle))
    for pos in range(shown):
        print(nmslib.getDataPoint(handle, pos))

    print('Let\'s invoke the index-build process')

    search_opts = ['efSearch=50']

    nmslib.loadIndex(handle, index_file)
    print("The index %s is loaded" % index_file)

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the loaded index")

    top_k = 2
    for query_no, query in enumerate(read_data('sample_queryset.txt')):
        print(query_no, nmslib.knnQuery(handle, top_k, query))

    nmslib.freeIndex(handle)
def test_add_points(self):
    """Adding two dense points must return positions 0 and 1 in that order."""
    vector = [0.5, 0.3, 0.4]

    first_pos = nmslib.addDataPoint(self.index, 1000, list(vector))
    self.assertEqual(0, first_pos)

    second_pos = nmslib.addDataPoint(self.index, 1001, list(vector))
    self.assertEqual(1, second_pos)
def test_vector_fresh(fast=True):
    """Build, query and save a dense-vector index, timing load and search.

    Args:
        fast: if true, load data with ``read_data_fast`` +
            ``addDataPointBatch`` and search with ``knnQueryBatch``
            (10 threads); otherwise add and query vectors one at a time,
            exiting with status 1 if a returned position differs from its
            id.

    An existing ``small_world_rand.index`` is removed first and the new
    index is saved under that name before being freed.
    """
    metric_space = 'cosinesimil'
    space_args = []
    algo = 'small_world_rand'
    index_file = algo + '.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    handle = nmslib.init(
        metric_space,
        space_args,
        algo,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    t_begin = time.time()
    if not fast:
        for point_id, vector in enumerate(read_data('sample_dataset.txt')):
            stored_at = nmslib.addDataPoint(handle, point_id, vector)
            if point_id != stored_at:
                print('id %s != pos %s' % (point_id, stored_at))
                sys.exit(1)
    else:
        vectors = read_data_fast('sample_dataset.txt')
        print('data.shape', vectors.shape)
        positions = nmslib.addDataPointBatch(
            handle, np.arange(len(vectors), dtype=np.int32), vectors)
    t_end = time.time()
    print('added data in %s secs' % (t_end - t_begin))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(handle))

    print("Distance between points (0,0) "
          + str(nmslib.getDistance(handle, 0, 0)))
    print("Distance between points (1,1) "
          + str(nmslib.getDistance(handle, 1, 1)))
    print("Distance between points (0,1) "
          + str(nmslib.getDistance(handle, 0, 1)))
    print("Distance between points (1,0) "
          + str(nmslib.getDistance(handle, 1, 0)))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(handle))
    for pos in range(shown):
        print(nmslib.getDataPoint(handle, pos))

    print('Let\'s invoke the index-build process')

    build_opts = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    search_opts = ['initSearchAttempts=3']

    nmslib.createIndex(handle, build_opts)
    print('The index is created')

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    top_k = 3

    t_begin = time.time()
    if not fast:
        for query_no, query in enumerate(read_data('sample_queryset.txt')):
            print(query_no, nmslib.knnQuery(handle, top_k, query))
    else:
        num_threads = 10
        queries = read_data_fast('sample_queryset.txt')
        batch_hits = nmslib.knnQueryBatch(handle, num_threads, top_k, queries)
        for query_no, hits in enumerate(batch_hits):
            print(query_no, hits)
    t_end = time.time()
    print('querying done in %s secs' % (t_end - t_begin))

    nmslib.saveIndex(handle, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(handle)
def test_add_points(self):
    """Two dense points added back to back must land at positions 0 and 1."""
    # (expected position, external id) pairs, in insertion order.
    for expected_pos, point_id in ((0, 1000), (1, 1001)):
        self.assertEqual(
            expected_pos,
            nmslib.addDataPoint(self.index, point_id, [0.5, 0.3, 0.4]))
def test_add_points(self):
    """Two string points added back to back must land at positions 0 and 1."""
    cases = ((1000, "string1"), (1001, "string2"))
    for expected_pos, (point_id, text) in enumerate(cases):
        returned = nmslib.addDataPoint(self.index, point_id, text)
        self.assertEqual(expected_pos, returned)
def bench_sparse_vector(batch=True):
    """Benchmark sparse k-NN: brute force, pysparnn, then nmslib.

    A random binary data set (40000 x 20000) and query set (1000 x 20000)
    are generated. The 3 nearest neighbours of the first query are found
    by exhaustive cosine distance, then with ``MultiClusterIndex``, then
    with an nmslib 'small_world_rand' index that is saved to
    ``small_world_rand_sparse.index``.

    Args:
        batch: if true, feed nmslib the CSR matrices through the batch
            add/query calls; otherwise convert each row to a list of
            ``[column, value]`` pairs and add/query one row at a time.
    """
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:',
          [[col, val] for col, val in enumerate(dataset[0]) if val > 0])

    k = 3

    # Exhaustive baseline for the first query.
    q0 = queryset[0]
    res = sorted(
        [[row_no, distance.cosine(q0, dataset[row_no, :])]
         for row_no in range(dataset.shape[0])],
        key=lambda pair: pair[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for hit in res[0]:
        print(int(hit), distance.cosine(q0, dataset[int(hit), :]))

    # Alternative space: 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_file = method_name + '_sparse.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    handle = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT,
    )

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(
                handle, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for row in dataset:
                d.append([[col, val] for col, val in enumerate(row)
                          if val > 0])
            for row in queryset:
                q.append([[col, val] for col, val in enumerate(row)
                          if val > 0])
        with TimeIt('adding points'):
            for point_id, sparse_row in enumerate(d):
                nmslib.addDataPoint(handle, point_id, sparse_row)

    print('Let\'s invoke the index-build process')

    build_opts = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_opts = ['efSearch=50']

    with TimeIt('building index'):
        nmslib.createIndex(handle, build_opts)

    print('The index is created')

    nmslib.setQueryTimeParams(handle, search_opts)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(handle, num_threads, k, query_matrix)
            for query_no, hits in enumerate(res):
                if query_no < 5:
                    print(query_no, hits)
                if query_no == 0:
                    # Cross-check the first query against the true distances.
                    for hit in hits:
                        print('q0', hit,
                              distance.cosine(q0, dataset[hit, :]))
        else:
            for query_no, sparse_query in enumerate(q):
                res = nmslib.knnQuery(handle, k, sparse_query)
                if query_no < 5:
                    print(query_no, res)

    nmslib.saveIndex(handle, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(handle)
def test_vector_fresh(fast=True):
    """Build, query and save a dense-vector index, timing load and search.

    Args:
        fast: if true, load data with ``read_data_fast`` +
            ``addDataPointBatch`` and search with ``knnQueryBatch``
            (10 threads); otherwise add and query vectors one at a time,
            exiting with status 1 if a returned position differs from its
            id.

    An existing ``small_world_rand.index`` is removed first and the new
    index is saved under that name. Output uses ``print(...)`` with one
    pre-formatted argument so the text is the same on Python 2 and 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        # ``data.shape`` is a tuple: wrap it so %-formatting prints it whole.
        print('data.shape %s' % (data.shape,))
        positions = nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        # ``point_id`` rather than ``id`` so the builtin is not shadowed.
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, data)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print('%s %s' % (idx, v))
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def bench_sparse_vector(batch=True):
    """Benchmark sparse k-NN: brute force, pysparnn, then nmslib.

    A random binary data set (40000 x 20000) and query set (1000 x 20000)
    are generated. The 3 nearest neighbours of the first query are found
    by exhaustive cosine distance, then with ``snn.MultiClusterIndex``,
    then with an nmslib 'small_world_rand' index that is saved to
    ``small_world_rand_sparse.index``.

    Args:
        batch: if true, feed nmslib the CSR matrices through the batch
            add/query calls; otherwise convert each row to a list of
            ``[column, value]`` pairs and add/query one row at a time.

    Output uses ``print(...)`` with one pre-formatted argument so the text
    is the same on Python 2 and Python 3.
    """
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    # Lists are wrapped in 1-tuples so %-formatting prints them whole.
    print('dataset[0]: %s'
          % ([[i, v] for i, v in enumerate(dataset[0]) if v > 0],))

    k = 3

    # Exhaustive baseline for the first query.
    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res %s' % (res[:k],))

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print('%s %s' % (int(i), distance.cosine(q0, dataset[int(i), :])))

    # Alternative space: 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(
                index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions %s' % (positions,))
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            # ``point_id`` rather than ``id`` so the builtin is not shadowed.
            for point_id, data in enumerate(d):
                nmslib.addDataPoint(index, point_id, data)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print('%s %s' % (idx, v))
                if idx == 0:
                    # Cross-check the first query against true distances.
                    for i in v:
                        print('q0 %s %s'
                              % (i, distance.cosine(q0, dataset[i, :])))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print('%s %s' % (idx, res))

    nmslib.saveIndex(index, index_name)
    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_add_points(self):
    """Adding two sparse points must return positions 0 and 1 in that order."""
    first_entries = [[0, 0.5], [5, 0.3], [6, 0.4]]
    first_pos = nmslib.addDataPoint(self.index, 1000, first_entries)
    self.assertEqual(0, first_pos)

    second_entries = [[0, 0.5], [3, 0.3], [5, 0.4]]
    second_pos = nmslib.addDataPoint(self.index, 1001, second_entries)
    self.assertEqual(1, second_pos)