def _add_data_to_index(self, data, indexes, batch_size):
    """Add the rows of ``data`` to ``self.index`` in consecutive batches.

    Args:
        data: Row-oriented array-like; it is converted with ``np.asarray``
            and sliced along axis 0.
        indexes: Selects the id given to the first row. A list contributes
            its first element (an empty list counts as 0), a number is used
            as-is, and anything else means 0.
        batch_size: Maximum number of rows handed to a single
            ``nmslib.addDataPointBatch`` call.

    Returns:
        The starting offset plus the number of rows that were added.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

    if isinstance(indexes, list):
        offset = indexes[0] if indexes else 0
    elif isinstance(indexes, numbers.Number):
        offset = indexes
    else:
        offset = 0

    data = np.asarray(data)
    total = data.shape[0]
    # Slice real batches of `batch_size` rows. np.split(data, batch_size)
    # would instead cut the data into `batch_size` equal sections and raise
    # whenever the row count is not evenly divisible.
    for start in range(0, total, batch_size):
        data_batch = data[start:start + batch_size]
        indices = np.arange(len(data_batch), dtype=np.int32) + offset
        nmslib.addDataPointBatch(self.index, indices, data_batch)
        offset += data_batch.shape[0]
    return offset
def test_vector_load(fast=True, fast_batch=True, seq=True):
    """Time three ways of loading dense vectors into a fresh nmslib index.

    A tab-separated file of random vectors is created at ``/tmp/foo.txt``
    when it does not exist yet. Each enabled mode builds its own index,
    loads the file into it inside a ``TimeIt`` block and frees the index.

    Args:
        fast: Load the whole file with ``read_data_fast`` and add it in a
            single ``addDataPointBatch`` call.
        fast_batch: Load the file in chunks with ``read_data_fast_batch``
            and add each chunk with ``addDataPointBatch``.
        seq: Add the vectors one at a time with ``addDataPoint``.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print('creating %s' % f)
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    if fast:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(
                index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                # Ids continue where the previous chunk stopped.
                nmslib.addDataPointBatch(
                    index,
                    np.arange(len(data), dtype=np.int32) + offset,
                    data)
                offset += data.shape[0]
        # NOTE(review): the source had lost its indentation; the total is
        # assumed to be reported once, after the timed loop -- confirm.
        # Single-argument print keeps the output identical on Python 2 and 3.
        print('offset %s' % (offset,))
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            for point_id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, point_id, data)
        nmslib.freeIndex(index)
def test_vector_load(fast=True, fast_batch=True, seq=True):
    """Compare bulk, chunked and one-by-one loading of dense vectors.

    The input file ``/tmp/foo.txt`` is generated with random values if it
    is missing. Every enabled mode creates its own index, fills it inside a
    ``TimeIt`` block and releases it afterwards.

    Args:
        fast: Read everything via ``read_data_fast`` and add it in one batch.
        fast_batch: Read chunks via ``read_data_fast_batch`` and add each
            chunk as a batch.
        seq: Add the vectors individually via ``addDataPoint``.
    """
    method_name = 'small_world_rand'
    space_param = []
    stale_index = method_name + '.index'
    if os.path.isfile(stale_index):
        os.remove(stale_index)

    path = '/tmp/foo.txt'
    if not os.path.isfile(path):
        print('creating %s' % path)
        np.savetxt(path, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    def make_index():
        # Every mode starts from an identically configured, empty index.
        return nmslib.init(
            'cosinesimil',
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)

    if fast:
        handle = make_index()
        with TimeIt('fast add data point'):
            vectors = read_data_fast(path)
            ids = np.arange(len(vectors), dtype=np.int32)
            nmslib.addDataPointBatch(handle, ids, vectors)
        nmslib.freeIndex(handle)

    if fast_batch:
        handle = make_index()
        with TimeIt('fast_batch add data point'):
            offset = 0
            for chunk in read_data_fast_batch(path, 10000):
                # Shift the local 0..n-1 ids past the chunks already added.
                ids = np.arange(len(chunk), dtype=np.int32) + offset
                nmslib.addDataPointBatch(handle, ids, chunk)
                offset += chunk.shape[0]
        # NOTE(review): the source had lost its indentation; the total is
        # assumed to be reported once, after the timed loop -- confirm.
        print('offset', offset)
        nmslib.freeIndex(handle)

    if seq:
        handle = make_index()
        with TimeIt('seq add data point'):
            for point_id, vector in enumerate(read_data(path)):
                nmslib.addDataPoint(handle, point_id, vector)
        nmslib.freeIndex(handle)
def test_add_points_batch5(self):
    """Check that a batch of three dense float32 points yields positions 0, 1, 2."""
    ids = np.array([0, 1, 2], dtype=np.int32)
    vectors = np.array(
        [[0.34, 0.54],
         [0.55, 0.52],
         [0.21, 0.68]],
        dtype=np.float32)
    positions = nmslib.addDataPointBatch(self.index, ids, vectors)

    expected = np.array([0, 1, 2], dtype=np.int32)
    nt.assert_array_equal(expected, positions)
def test_add_points_batch5(self):
    """Check that a 3x3 CSR matrix added as one batch yields positions 0, 1, 2.

    The matrix is built from coordinate triples and printed in dense form
    before it is handed to ``nmslib.addDataPointBatch``.
    """
    row = np.array([0, 0, 1, 2, 2])
    col = np.array([0, 2, 1, 1, 2])
    data = np.array([0.3, 0.2, 0.4, 0.1, 0.6])
    m = csr_matrix((data, (row, col)), dtype=np.float32, shape=(3, 3))
    # print() with one argument behaves the same on Python 2 and Python 3.
    print(m.toarray())
    positions = nmslib.addDataPointBatch(
        self.index, np.array([0, 1, 2], dtype=np.int32), m)
    nt.assert_array_equal(np.array([0, 1, 2], dtype=np.int32), positions)
def test_string_fresh(batch=True):
    """Build, query and save a Levenshtein-space index over a few strings.

    Args:
        batch: When true, the strings are added with a single
            ``addDataPointBatch`` call and a ``knnQueryBatch`` call is
            issued before the per-query loop; otherwise the strings are
            added one at a time with ``addDataPoint``.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.INT)

    if batch:
        # Single-argument print keeps the output identical on Python 2 and 3.
        print('DATA_STRS %s' % (DATA_STRS,))
        nmslib.addDataPointBatch(
            index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for point_id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    # NOTE(review): the source had lost its indentation; this loop is assumed
    # to run in both modes, which means the batch result above is replaced
    # without being inspected -- confirm against the intended behaviour.
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print('%s %s %s %s' % (idx, data, res, [DATA_STRS[i] for i in res]))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_add_points_batch5(self):
    """Add a sparse 3x3 float32 matrix as one batch and expect positions 0, 1, 2.

    The CSR matrix is assembled from (value, (row, col)) triples and its
    dense form is printed before the batch is added to ``self.index``.
    """
    row = np.array([0, 0, 1, 2, 2])
    col = np.array([0, 2, 1, 1, 2])
    data = np.array([0.3, 0.2, 0.4, 0.1, 0.6])
    m = csr_matrix((data, (row, col)), dtype=np.float32, shape=(3, 3))
    # Function-call form: valid on Python 3, same output on Python 2.
    print(m.toarray())
    ids = np.array([0, 1, 2], dtype=np.int32)
    positions = nmslib.addDataPointBatch(self.index, ids, m)
    nt.assert_array_equal(np.array([0, 1, 2], dtype=np.int32), positions)
def test_string_fresh(batch=True):
    """Exercise a string index end to end: add, build, query, save, free.

    The index uses the ``leven`` space with integer distances over a fixed
    list of short strings.

    Args:
        batch: When true, all strings are added through one
            ``addDataPointBatch`` call and ``knnQueryBatch`` is invoked
            before the per-query loop; otherwise strings are added
            individually through ``addDataPoint``.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa", "a",
        "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb"
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type, space_param, method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        # %-formatting into one argument prints the same text on Python 2
        # and Python 3, unlike a multi-argument print.
        print('DATA_STRS %s' % (DATA_STRS,))
        nmslib.addDataPointBatch(
            index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for point_id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    # NOTE(review): the source had lost its indentation; this loop is assumed
    # to run regardless of `batch`, so the batch result above is overwritten
    # without being printed -- confirm against the intended behaviour.
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print('%s %s %s %s' % (idx, data, res, [DATA_STRS[i] for i in res]))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    """Build, query and save a cosine-similarity index fed with string objects.

    Data comes from ``sample_dataset.txt`` and queries from
    ``sample_queryset.txt``, both read through ``read_data_as_string``.
    Any existing index file with the target name is removed first.

    Args:
        batch: When true, all data strings are added with one
            ``addDataPointBatch`` call; otherwise they are added one at a
            time with ``addDataPoint``.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.FLOAT)

    if batch:
        data = list(read_data_as_string('sample_dataset.txt'))
        nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        for point_id, data in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        # One pre-formatted argument prints identically on Python 2 and 3.
        print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    """Run the add / build / query / save cycle on string-encoded objects.

    Reads data from ``sample_dataset.txt`` and queries from
    ``sample_queryset.txt`` via ``read_data_as_string``. A leftover index
    file with the target name is deleted before the run.

    Args:
        batch: Add all data strings in one ``addDataPointBatch`` call when
            true, otherwise one ``addDataPoint`` call per string.
    """
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        'cosinesimil',
        [],
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.FLOAT)

    if not batch:
        for point_id, item in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, point_id, item)
    else:
        items = list(read_data_as_string('sample_dataset.txt'))
        ids = np.arange(len(items), dtype=np.int32)
        positions = nmslib.addDataPointBatch(index, ids, items)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))
    for i in range(0, shown):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    nmslib.createIndex(
        index, ['NN=17', 'efConstruction=50', 'indexThreadQty=4'])

    print('The index is created')

    nmslib.setQueryTimeParams(index, ['efSearch=50'])

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    queries = read_data_as_string('sample_queryset.txt')
    for idx, query in enumerate(queries):
        print(idx, nmslib.knnQuery(index, k, query))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_add_points_batch5(self):
    """Verify that batch-adding three dense 2-D points returns positions 0, 1, 2."""
    points = np.array(
        [[0.34, 0.54], [0.55, 0.52], [0.21, 0.68]], dtype=np.float32)
    point_ids = np.array([0, 1, 2], dtype=np.int32)

    positions = nmslib.addDataPointBatch(self.index, point_ids, points)

    nt.assert_array_equal(np.array([0, 1, 2], dtype=np.int32), positions)
def test_add_points_batch5(self):
    """Verify that batch-adding three strings returns positions 0, 1, 2."""
    strings = ["string1", "string2", "string3"]
    ids = np.array([0, 1, 2], dtype=np.int32)

    positions = nmslib.addDataPointBatch(self.index, ids, strings)

    nt.assert_array_equal(np.array([0, 1, 2], dtype=np.int32), positions)
def bench_sparse_vector(batch=True):
    """Benchmark sparse cosine k-NN search: pysparnn versus nmslib.

    Random binary data and query matrices are generated, a brute-force
    answer for the first query is printed, a pysparnn ``MultiClusterIndex``
    is built and queried, and the same data is then indexed and queried
    with nmslib. Each phase is wrapped in a ``TimeIt`` block.

    Args:
        batch: Use ``addDataPointBatch`` / ``knnQueryBatch`` on CSR matrices
            when true; otherwise convert every row to ``[index, value]``
            pairs and add / query the points one at a time.
    """
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    # Brute-force reference ranking for the first query.
    q0 = queryset[0]
    res = [[i, distance.cosine(q0, dataset[i, :])]
           for i in range(dataset.shape[0])]
    res.sort(key=lambda pair: pair[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for hit in res[0]:
        print(int(hit), distance.cosine(q0, dataset[int(hit), :]))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type, space_param, method_name,
                        nmslib.DataType.SPARSE_VECTOR, nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            ids = np.arange(len(dataset), dtype=np.int32)
            positions = nmslib.addDataPointBatch(index, ids, data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            # Keep only the non-zero entries of each row as [index, value].
            for row in dataset:
                d.append([[i, v] for i, v in enumerate(row) if v > 0])
            for row in queryset:
                q.append([[i, v] for i, v in enumerate(row) if v > 0])
        with TimeIt('adding points'):
            for point_id, sparse_row in enumerate(d):
                nmslib.addDataPoint(index, point_id, sparse_row)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print(idx, v)
                if idx == 0:
                    for i in v:
                        print('q0', i, distance.cosine(q0, dataset[i, :]))
        else:
            for idx, sparse_query in enumerate(q):
                res = nmslib.knnQuery(index, k, sparse_query)
                if idx < 5:
                    print(idx, res)

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Run the add / build / query / save cycle on dense vectors, with timing.

    Data is read from ``sample_dataset.txt`` and queries from
    ``sample_queryset.txt``. A leftover index file with the target name is
    deleted first. In the slow path the process exits with status 1 as
    soon as ``addDataPoint`` returns a position different from the id.

    Args:
        fast: Use ``read_data_fast`` with ``addDataPointBatch`` /
            ``knnQueryBatch`` when true; otherwise use ``read_data`` with
            one ``addDataPoint`` / ``knnQuery`` call per vector.
    """
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init('cosinesimil', [], method_name,
                        nmslib.DataType.DENSE_VECTOR, nmslib.DistType.FLOAT)

    began = time.time()
    if not fast:
        for point_id, vector in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, vector)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)
    else:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        ids = np.arange(len(data), dtype=np.int32)
        positions = nmslib.addDataPointBatch(index, ids, data)
    finished = time.time()
    print('added data in %s secs' % (finished - began))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    get_distance = nmslib.getDistance
    print("Distance between points (0,0) " + str(get_distance(index, 0, 0)))
    print("Distance between points (1,1) " + str(get_distance(index, 1, 1)))
    print("Distance between points (0,1) " + str(get_distance(index, 0, 1)))
    print("Distance between points (1,0) " + str(get_distance(index, 1, 0)))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))
    for i in range(0, shown):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    nmslib.createIndex(
        index, ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'])

    print('The index is created')

    nmslib.setQueryTimeParams(index, ['initSearchAttempts=3'])

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    began = time.time()
    if not fast:
        for idx, vector in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, vector))
    else:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, neighbours in enumerate(res):
            print(idx, neighbours)
    finished = time.time()
    print('querying done in %s secs' % (finished - began))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Build, query and save a dense-vector cosine index, timing each phase.

    Data is read from ``sample_dataset.txt`` and queries from
    ``sample_queryset.txt``. Any existing index file with the target name
    is removed first. In the slow path the process exits with status 1 if
    ``addDataPoint`` returns a position different from the id it was given.

    Args:
        fast: Use ``read_data_fast`` with ``addDataPointBatch`` /
            ``knnQueryBatch`` when true; otherwise use ``read_data`` with
            one ``addDataPoint`` / ``knnQuery`` call per vector.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        # Multi-value prints are pre-formatted into one argument so the
        # output is identical on Python 2 and Python 3.
        print('data.shape %s' % (data.shape,))
        nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, data)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print('%s %s' % (idx, v))
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def bench_sparse_vector(batch=True):
    """Benchmark sparse cosine k-NN search: pysparnn versus nmslib.

    Random binary data and query matrices are generated, a brute-force
    answer for the first query is printed, an ``snn.MultiClusterIndex`` is
    built and queried, and the same data is then indexed and queried with
    nmslib. Each phase is wrapped in a ``TimeIt`` block.

    Args:
        batch: Use ``addDataPointBatch`` / ``knnQueryBatch`` on CSR matrices
            when true; otherwise convert every row to ``[index, value]``
            pairs and add / query the points one at a time.
    """
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    # Multi-value prints are pre-formatted into one argument so the output
    # is identical on Python 2 and Python 3.
    print('dataset[0]: %s'
          % ([[i, v] for i, v in enumerate(dataset[0]) if v > 0],))

    k = 3

    # Brute-force reference ranking for the first query.
    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res %s' % (res[:k],))

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print('%s %s' % (int(i), distance.cosine(q0, dataset[int(i), :])))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(
                index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions %s' % (positions,))
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            # Keep only the non-zero entries of each row as [index, value].
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for point_id, data in enumerate(d):
                nmslib.addDataPoint(index, point_id, data)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print('%s %s' % (idx, v))
                if idx == 0:
                    for i in v:
                        print('q0 %s %s'
                              % (i, distance.cosine(q0, dataset[i, :])))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print('%s %s' % (idx, res))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_add_points_batch5(self):
    """Check that three strings added as one batch land at positions 0, 1, 2."""
    payload = ["string1", "string2", "string3"]
    requested_ids = np.array([0, 1, 2], dtype=np.int32)
    positions = nmslib.addDataPointBatch(self.index, requested_ids, payload)

    expected = np.array([0, 1, 2], dtype=np.int32)
    nt.assert_array_equal(expected, positions)