def test_string_loaded():
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    for id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print "The index %s is loaded" % index_name

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'
    print "Results for the loaded index:"

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)

def test_vector_load(fast=True, fast_batch=True, seq=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print 'creating %s' % f
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print 'done'

    if fast:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32) + offset, data)
                offset += data.shape[0]
        print 'offset', offset
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(space_type,
                            space_param,
                            method_name,
                            nmslib.DataType.DENSE_VECTOR,
                            nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            for id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, id, data)
        nmslib.freeIndex(index)

def test_string_fresh(batch=True):
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        print 'DATA_STRS', DATA_STRS
        positions = nmslib.addDataPointBatch(index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print idx, data, res, [DATA_STRS[i] for i in res]

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)

def setUp(self): space_type = "normleven" space_param = [] method_name = "small_world_rand" index_name = method_name + ".index" if os.path.isfile(index_name): os.remove(index_name) self.index = nmslib.init( space_type, space_param, method_name, nmslib.DataType.OBJECT_AS_STRING, nmslib.DistType.FLOAT )
def setUp(self): space_type = "cosinesimil_sparse" space_param = [] method_name = "small_world_rand" index_name = method_name + ".index" if os.path.isfile(index_name): os.remove(index_name) self.index = nmslib.init( space_type, space_param, method_name, nmslib.DataType.SPARSE_VECTOR, nmslib.DistType.FLOAT )
def fit(self, Ciu):
    # nmslib can be a little chatty when first imported, disable some of
    # the logging
    logging.getLogger('nmslib').setLevel(logging.WARNING)
    import nmslib

    # train the model
    super(NMSLibAlternatingLeastSquares, self).fit(Ciu)

    # create index for similar_items
    if self.approximate_similar_items:
        log.debug("Building nmslib similar items index")
        self.similar_items_index = nmslib.init(
            method=self.method, space='cosinesimil')

        # there are some numerical instability issues here with
        # building a cosine index with vectors with 0 norms, hack around this
        # by just not indexing them
        norms = numpy.linalg.norm(self.item_factors, axis=1)
        ids = numpy.arange(self.item_factors.shape[0])

        # delete zero valued rows from the matrix
        item_factors = numpy.delete(self.item_factors, ids[norms == 0], axis=0)
        ids = ids[norms != 0]

        self.similar_items_index.addDataPointBatch(item_factors, ids=ids)
        self.similar_items_index.createIndex(self.index_params,
                                             print_progress=self.show_progress)
        self.similar_items_index.setQueryTimeParams(self.query_params)

    # build up a separate index for the inner product (for recommend
    # methods)
    if self.approximate_recommend:
        log.debug("Building nmslib recommendation index")
        self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
        self.recommend_index = nmslib.init(method='hnsw', space='cosinesimil')
        self.recommend_index.addDataPointBatch(extra)
        self.recommend_index.createIndex(self.index_params,
                                         print_progress=self.show_progress)
        self.recommend_index.setQueryTimeParams(self.query_params)

def fit(self, X):
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what(): The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less than
        # <bucket size> * 1000
        # Aborted (core dumped)
        self._method_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

    self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
    self._index.addDataPointBatch(X)
    nmslib.createIndex(self._index, self._method_param)

def test_object_as_string_fresh(batch=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.FLOAT)

    if batch:
        data = [s for s in read_data_as_string('sample_dataset.txt')]
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)

def setUp(self):
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    self.index = nmslib.init(space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.INT)

def setUp(self):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    self.index = nmslib.init(space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)

def fit(self, X):
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what(): The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less than
        # <bucket size> * 1000
        # Aborted (core dumped)
        self._method_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

    self._index = nmslib.init(self._nmslib_metric,
                              [],
                              self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)
    for i, x in enumerate(X):
        nmslib.addDataPoint(self._index, i, x.tolist())
    nmslib.createIndex(self._index, self._method_param)

def test_sparse_vector_fresh():
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, id, data)

    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    for idx, data in enumerate(read_sparse_data('sample_sparse_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)

def testSparse(self):
    index = nmslib.init(method='small_world_rand', space='cosinesimil_sparse',
                        data_type=nmslib.DataType.SPARSE_VECTOR)
    index.addDataPoint(0, [(1, 2.), (2, 3.)])
    index.addDataPoint(1, [(0, 1.), (1, 2.)])
    index.addDataPoint(2, [(2, 3.), (3, 3.)])
    index.addDataPoint(3, [(3, 1.)])

    index.createIndex()

    ids, distances = index.knnQuery([(1, 2.), (2, 3.)])
    self.assertEqual(ids[0], 0)
    self.assertEqual(distances[0], 0)

    self.assertEqual(len(index), 4)
    self.assertEqual(index[3], [(3, 1.0)])

def test_vector_loaded():
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, id, data)
        if id != pos:
            print 'id %s != pos %s' % (id, pos)
            sys.exit(1)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print "The index %s is loaded" % index_name

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the loaded index"

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)

def fit(self, X):
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what(): The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less than
        # <bucket size> * 1000
        # Aborted (core dumped)
        self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

    self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
    self._index.addDataPointBatch(X)

    if os.path.exists(self._index_name):
        print('Loading index from file')
        self._index.loadIndex(self._index_name)
    else:
        self._index.createIndex(self._index_param)
        if self._save_index:
            self._index.saveIndex(self._index_name)

    self._index.setQueryTimeParams(self._query_param)

def testStringLeven(self):
    index = nmslib.init(space='leven',
                        dtype=nmslib.DistType.INT,
                        data_type=nmslib.DataType.OBJECT_AS_STRING,
                        method='small_world_rand')

    strings = [''.join(x) for x in itertools.permutations(['a', 't', 'c', 'g'])]

    index.addDataPointBatch(strings)

    index.addDataPoint(len(index), "atat")
    index.addDataPoint(len(index), "gaga")
    index.createIndex()

    for i, distance in zip(*index.knnQuery(strings[0])):
        self.assertEqual(index.getDistance(0, i), distance)

    self.assertEqual(len(index), len(strings) + 2)
    self.assertEqual(index[0], strings[0])
    self.assertEqual(index[len(index) - 2], 'atat')

def bench_sparse_vector(batch=True):
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000

    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])
    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print(int(i), distance.cosine(q0, dataset[int(i), :]))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')
    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print(idx, v)
                if idx == 0:
                    for i in v:
                        print('q0', i, distance.cosine(q0, dataset[i, :]))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print(idx, res)

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)

import sys
import time
import math
import pdb

import nmslib
from xclib.data import data_utils
import hnswlib

lbl_ft_file = sys.argv[1]
model_file = sys.argv[2]
M = int(sys.argv[3])
efC = int(sys.argv[4])
num_threads = int(sys.argv[5])
num_ft = int(sys.argv[6])
metric_space = sys.argv[7]

start = time.time()
data = data_utils.read_sparse_file(lbl_ft_file)
end = time.time()

start = time.time()
index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data)
index.createIndex({'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC})
end = time.time()

print('Training time of ANNS datastructure = %f' % (end - start))
nmslib.saveIndex(index, model_file)

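# A sketch (not part of the original script) of how the sparse index saved
# above could later be reloaded and queried. With a non-optimized sparse HNSW
# index, the usual pattern is to re-add the same data before calling
# loadIndex; lbl_ft_file and model_file are reused from the script above, and
# the efSearch value and k are illustrative assumptions.
import nmslib
from xclib.data import data_utils

data = data_utils.read_sparse_file(lbl_ft_file)
new_index = nmslib.init(method='hnsw',
                        space='cosinesimil_sparse',
                        data_type=nmslib.DataType.SPARSE_VECTOR)
new_index.addDataPointBatch(data)
new_index.loadIndex(model_file)
new_index.setQueryTimeParams({'efSearch': 100})

# query with a sparse row matrix; returns a list of (ids, distances) tuples
neighbours = new_index.knnQueryBatch(data[:5], k=10, num_threads=4)
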
import dlib, os, shutil
import numpy as np
from skimage import io
from scipy.spatial import distance
import pickle
import nmslib

index = nmslib.init(method='hnsw', space='l2', data_type=nmslib.DataType.DENSE_VECTOR)

files = os.listdir('npy')
es = []
e = 0
ff = open('associations.txt', 'w')
for x in files:
    e = e + 1
    name, _ = os.path.splitext(x)
    embedding = np.load('npy/' + x)
    ff.write(str(e) + '|' + x + '\n')
    index.addDataPoint(e, embedding)

index_time_params = {
    'indexThreadQty': 4,
    'skip_optimized_index': 0,
    'post': 2,
    'delaunay_type': 1,
    'M': 100,
    'efConstruction': 2000

def create_and_search_index(retcfg, jobs):
    batch_size = -1

    q_features = load_features(retcfg['path']['qfeature'])
    q_namelist = np.loadtxt(retcfg['path']['qlist'],
                            dtype=dict(names=('qname', 'nfeat'), formats=('U100', np.int32)))

    assert q_features.shape[0] == np.sum(q_namelist['nfeat']), "Inconsistent number of features sum and size of" \
                                                               "query features array"

    norm = retcfg.get('feature', 'norm', fallback=None)

    db_features = load_features(retcfg['path']['dbfeature'])

    if norm:
        db_features = normalize(db_features, norm)
        q_features = normalize(q_features, norm)

    outdir = retcfg['path']['outdir'] + "queryfiles/"
    safe_create_dir(outdir)

    index_type = retcfg['index']['index_type']
    dist_type = retcfg['index']['dist_type']

    knn = retcfg.getint('search', 'knn')

    M = retcfg.getint('index', 'M', fallback=20)
    efC = retcfg.getint('index', 'efC', fallback=20)

    print(" -- Creating <{0:s}> NN index".format(index_type))
    print("    -> KNN: {0:d}".format(knn))
    print("    -> Metric: {0:s}\n".format(dist_type))

    nnidx = nmslib.init(method=index_type, space=dist_type)
    nnidx.addDataPointBatch(db_features)
    del db_features

    nnidx.createIndex({'post': 2}, print_progress=True)
    nnidx.setQueryTimeParams({'efSearch': knn})

    if batch_size == -1:
        batch_size = q_features.shape[0]

    n_batches = int(np.ceil(q_features.shape[0] / batch_size))

    for i in tqdm(range(n_batches), ncols=100, desc='Batch', total=n_batches):
        s = i * batch_size
        e = s + batch_size
        batch_q_features = q_features[s:e]

        neighbours = nnidx.knnQueryBatch(batch_q_features, k=10, num_threads=jobs)
        neighbours = list(zip(*neighbours))

        indices = np.array(neighbours[0])
        distances = np.array(neighbours[1])

        s = 0
        for qname, n in q_namelist:
            qdists = distances[s:s + n]
            qidx = indices[s:s + n]

            matchfpath = "{0:s}{1:s}.matches".format(outdir, qname)
            distfpath = "{0:s}{1:s}.dist".format(outdir, qname)

            print(qname, "-> ", s, ":", s + n)
            print("   |_ dists: ", qdists.shape)
            print("   |_ indices: ", qidx.shape, end="\n---\n")

            np.save(matchfpath + ".npy", qidx)
            np.save(distfpath + ".npy", qdists)

            s += n

        print("---", flush=True)

import nmslib
import utils

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
server = app.server

# ------------- loading index -----------
os.system("setup.py")

# reading the arxiv data
df = pd.read_csv("data/arxiv_smaller.csv")

index_title = nmslib.init(method='hnsw', space='cosinesimil')
index_author = nmslib.init(method='hnsw', space='cosinesimil')
index_categories = nmslib.init(method='hnsw', space='cosinesimil')

index_title.loadIndex("index_title.bin")
index_author.loadIndex("index_author.bin")
index_categories.loadIndex("index_categories.bin")

# ------------- Define layout for the app ----------------
app.layout = html.Div([
    dcc.Tabs(id='tabs-nav', value='tab-1', children=[
        dcc.Tab(label='Search engine', value='tab-1'),
        dcc.Tab(label='Data', value='tab-2'),

def fit(self, data):
    self.index = nmslib.init(method=self.method, space='cosinesimil')
    self.index.addDataPointBatch(data)
    self.index.createIndex(self.indexparams, print_progress=True)

algorithm = []
construciotnTimes = []
searchTimes = []
reacll = []
k = 100
avgdistances = []
constructionClocks = []
searchClocks = []
clockAlg = []

## vp-tree
import nmslib

vptree = nmslib.init(method='vptree', space='l2')

startTime = process_time()
vptree.addDataPointBatch(train)
vptree.createIndex({'bucketSize': 10000, 'selectPivotAttempts': 10})
end_time = process_time()
constructionTime = end_time - startTime

# get all nearest neighbours for all the datapoint
# using a pool of 4 threads to compute
for maxLeave in [30]:  # [2,10,15,20,25]:
    vptree.setQueryTimeParams({'maxLeavesToVisit': maxLeave, 'alphaLeft': 1.1, 'alphaRight': 1.1})
    startTime = process_time()
    neighbours = vptree.knnQueryBatch(query, k=100, num_threads=2)
    end_time = process_time()

def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)

        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((query_ids[index1], center_ids[index2]))

    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list

        model_config.inputs['share']['preprocess'])
else:
    pre = engine.load_preprocessor(preprocess_dir, model_config.net_name)

model_config.inputs['share']['custom_corpus'] = os.path.join(
    base_dir, model_config.inputs['share']['custom_corpus'])

docs, embeds = build_document_embeddings(config)

logger.info("Loading search index...")
index_name = 'custom_index'
if not os.path.exists(index_name):
    logger.info("Search index not found. Building it...")
    search_engine = build_search_index(embeds)
    search_engine.saveIndex(index_name)
else:
    search_engine = nmslib.init(method='hnsw', space='cosinesimil')
    search_engine.loadIndex(index_name)

logger.info("Model ready to query.")


@hug.cli()
@hug.get(examples='query=how%20to%20connect%20to%20printer')
@hug.local()
def search(query: hug.types.text):
    sparse_input = pre.transform_list([query])[0]
    sparse_input = np.expand_dims(sparse_input, axis=0)
    dense_input = embed_model.predict(sparse_input)[0]
    idxs, dists = search_engine.knnQuery(dense_input, k=3)
    res = []

c2v.vectorize_dict(dict_path, encoding_type=encoding_type)
if not os.path.exists('./dict_index_{}.bin'.format(emb_size)):
    c2v.create_index(emb_size)

"""Loading necessary resources"""
dictionnary = []
with open(dict_path, 'r', encoding=encoding_type) as file:
    for line in file:
        dictionnary.append(line.strip())
    if dict_path.endswith(".json"):
        dictionnary = json.load(file)
    else:
        dictionnary = []
        for line in file:
            dictionnary.append(line.strip())

index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex('./dict_index_{}.bin'.format(emb_size))

c2v_model = c2v.load_model("train_fr_150")

import time

if len(sys.argv) == 1:
    """ K-nearest-neigbors search"""
    print("\nEdit distance 1:")
    stamp = time.time()
    requests1 = []
    requests1.append(c2v.find_knn("langage", dictionnary, c2v_model, index))
    requests1.append(c2v.find_knn("langqge", dictionnary, c2v_model, index))
    requests1.append(c2v.find_knn("langagee", dictionnary, c2v_model, index))
    time1 = (time.time() - stamp)

def nmslib_init():
    """Initializes an nmslib index object"""
    index = nmslib.init(method='hnsw', space='cosinesimil')
    return index

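# A minimal usage sketch for an index created by a helper like nmslib_init()
# above: populate it, build it, and run a single k-NN query. The random data
# and the parameter values here are illustrative assumptions, not taken from
# the original code.
import numpy as np
import nmslib

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(np.random.rand(1000, 64).astype(np.float32))
index.createIndex({'M': 16, 'efConstruction': 100}, print_progress=False)
index.setQueryTimeParams({'efSearch': 100})

# knnQuery returns parallel arrays of neighbour ids and distances
ids, distances = index.knnQuery(np.random.rand(64).astype(np.float32), k=10)
print(ids, distances)
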
    return relevant / total


if __name__ == '__main__':
    # load data
    annoy_metrics = 'angular'
    annoy_metrics = 'euclidean'
    start_scratch = True
    if start_scratch:
        df = pd.read_csv('user_factor.csv', header=None)
        df = df.values[:, 1:]
        num_users, ranks = df.shape

        t = AnnoyIndex(ranks, metric=annoy_metrics)
        t.load('tree_50')

        space_name = 'l2'
        index = nmslib.init(method='hnsw', space=space_name)
        index.addDataPointBatch(df)
        index.loadIndex('hnsw_index80.bin')

        # Set index parameters
        # These are the most important ones
        NN = 50
        efC = 100

        num_threads = 4
        index_time_params = {'NN': NN, 'indexThreadQty': num_threads, 'efConstruction': efC}

def shortcut_search_query(user_input_query, user_input_program, user_input_device):
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.loadIndex('src/models/sparse_index_word2vec_shortcut_search.bin', load_data=True)
    ids = shortcut_query(user_input_query, user_input_program, user_input_device, index)
    return ids

                             (bm25.doc_len[i] / bm25.avgdl)) + bm25.doc_freqs[i][word])
        weighted_vector = vector * weight
        doc_vector.append(weighted_vector)
    doc_vector_mean = np.mean(doc_vector, axis=0)
    weighted_doc_vects.append(doc_vector_mean)

# Save vectors
pickle.dump(weighted_doc_vects,
            open("models/weighted_doc_vects_" + searchname + "_.p", "wb"))  # save the results to disc

# create a matrix from our document vectors
data = np.vstack(weighted_doc_vects)

# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)


# Search function
def besceaSearch(query_text):
    output_list = []
    input = query_text.lower().split()
    query = [ft_model[vec] for vec in input]
    query = np.mean(query, axis=0)

    t0 = time.time()
    ids, distances = index.knnQuery(query, k=return_results_count)
    t1 = time.time()
    print(f'Searched {df.shape[0]} records in {round(t1-t0,4) } seconds \n')

    for i, j in zip(ids, distances):

def predict_topk(biosyn, eval_dictionary, eval_queries, topk, score_mode='hybrid', type_given=False):
    """
    Parameters
    ----------
    score_mode : str
        hybrid, dense, sparse
    """
    encoder = biosyn.get_dense_encoder()
    tokenizer = biosyn.get_dense_tokenizer()
    sparse_encoder = biosyn.get_sparse_encoder()
    sparse_weight = biosyn.get_sparse_weight().item()  # must be scalar value

    # useful if we're conditioning on types
    all_indv_types = [x for t in eval_dictionary[:, 1] for x in t.split('|')]
    unique_types = np.unique(all_indv_types).tolist()
    v_check_type = np.vectorize(check_label)
    inv_idx = {
        t: v_check_type(eval_dictionary[:, 1], t).nonzero()[0]
        for t in unique_types
    }

    # embed dictionary
    dict_sparse_embeds = biosyn.embed_sparse(names=eval_dictionary[:, 0], show_progress=True)
    dict_dense_embeds = biosyn.embed_dense(names=eval_dictionary[:, 0], show_progress=True)

    # build the sparse index
    if not type_given:
        sparse_index = nmslib.init(method='hnsw',
                                   space='negdotprod_sparse_fast',
                                   data_type=nmslib.DataType.SPARSE_VECTOR)
        sparse_index.addDataPointBatch(dict_sparse_embeds)
        sparse_index.createIndex({'post': 2}, print_progress=False)
    else:
        sparse_index = {}
        for sty, indices in inv_idx.items():
            sparse_index[sty] = nmslib.init(method='hnsw',
                                            space='negdotprod_sparse_fast',
                                            data_type=nmslib.DataType.SPARSE_VECTOR)
            sparse_index[sty].addDataPointBatch(dict_sparse_embeds[indices])
            sparse_index[sty].createIndex({'post': 2}, print_progress=False)

    # build the dense index
    d = dict_dense_embeds.shape[1]
    if not type_given:
        nembeds = dict_dense_embeds.shape[0]
        if nembeds < 10000:  # if the number of embeddings is small, don't approximate
            dense_index = faiss.IndexFlatIP(d)
            dense_index.add(dict_dense_embeds)
        else:
            nlist = int(math.floor(math.sqrt(nembeds)))  # number of quantized cells
            nprobe = int(math.floor(math.sqrt(nlist)))  # number of the quantized cells to probe
            quantizer = faiss.IndexFlatIP(d)
            dense_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
            dense_index.train(dict_dense_embeds)
            dense_index.add(dict_dense_embeds)
            dense_index.nprobe = nprobe
    else:
        dense_index = {}
        for sty, indices in inv_idx.items():
            sty_dict_dense_embeds = dict_dense_embeds[indices]
            nembeds = sty_dict_dense_embeds.shape[0]
            if nembeds < 10000:  # if the number of embeddings is small, don't approximate
                dense_index[sty] = faiss.IndexFlatIP(d)
                dense_index[sty].add(sty_dict_dense_embeds)
            else:
                nlist = int(math.floor(math.sqrt(nembeds)))  # number of quantized cells
                nprobe = int(math.floor(math.sqrt(nlist)))  # number of the quantized cells to probe
                quantizer = faiss.IndexFlatIP(d)
                dense_index[sty] = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
                dense_index[sty].train(sty_dict_dense_embeds)
                dense_index[sty].add(sty_dict_dense_embeds)
                dense_index[sty].nprobe = nprobe

    # respond to mention queries
    queries = []
    for eval_query in tqdm(eval_queries, total=len(eval_queries)):
        mentions = eval_query[0].replace("+", "|").split("|")
        golden_cui = eval_query[1].replace("+", "|")
        golden_sty = eval_query[2].replace("+", "|")
        pmid = eval_query[3]
        start_char = eval_query[4]
        end_char = eval_query[5]

        dict_mentions = []
        for mention in mentions:
            mention_sparse_embeds = biosyn.embed_sparse(names=np.array([mention]))
            mention_dense_embeds = biosyn.embed_dense(names=np.array([mention]))

            # search the sparse index
            if not type_given:
                sparse_nn = sparse_index.knnQueryBatch(mention_sparse_embeds, k=topk, num_threads=20)
            else:
                sparse_nn = sparse_index[golden_sty].knnQueryBatch(mention_sparse_embeds, k=topk, num_threads=20)
            sparse_idxs, _ = zip(*sparse_nn)
            s_candidate_idxs = np.asarray(sparse_idxs)
            if type_given:
                # reverse mask index mapping
                s_candidate_idxs = inv_idx[golden_sty][s_candidate_idxs]
            s_candidate_idxs = s_candidate_idxs.astype(np.int64)

            # search the dense index
            if not type_given:
                _, d_candidate_idxs = dense_index.search(mention_dense_embeds, topk)
            else:
                _, d_candidate_idxs = dense_index[golden_sty].search(mention_dense_embeds, topk)
                # reverse mask index mapping
                d_candidate_idxs = inv_idx[golden_sty][d_candidate_idxs]
            d_candidate_idxs = d_candidate_idxs.astype(np.int64)

            # get the reduced candidate set
            reduced_candidate_idxs = np.unique(
                np.hstack([
                    s_candidate_idxs.reshape(-1, ),
                    d_candidate_idxs.reshape(-1, )
                ]))

            # get score matrix
            sparse_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_sparse_embeds,
                dict_embeds=dict_sparse_embeds[reduced_candidate_idxs, :]).todense()
            dense_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_dense_embeds,
                dict_embeds=dict_dense_embeds[reduced_candidate_idxs, :])
            if score_mode == 'hybrid':
                score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
            elif score_mode == 'dense':
                score_matrix = dense_score_matrix
            elif score_mode == 'sparse':
                score_matrix = sparse_score_matrix
            else:
                raise NotImplementedError()

            # take care of getting the best indices
            candidate_idxs = biosyn.retrieve_candidate(score_matrix=score_matrix, topk=topk)
            candidate_idxs = reduced_candidate_idxs[candidate_idxs]

            np_candidates = eval_dictionary[candidate_idxs].squeeze()
            dict_candidates = []
            for np_candidate in np_candidates:
                dict_candidates.append({
                    'name': np_candidate[0],
                    'sty': np_candidate[1],
                    'cui': np_candidate[2],
                    'label': check_label(np_candidate[2], golden_cui)
                })
            dict_mentions.append({
                'mention': mention,
                'golden_cui': golden_cui,  # golden_cui can be composite cui
                'pmid': pmid,
                'start_char': start_char,
                'end_char': end_char,
                'candidates': dict_candidates
            })
        queries.append({'mentions': dict_mentions})

    result = {'queries': queries}

    return result

def load_index(model_name: str):
    import nmslib

    index = nmslib.init(method="hnsw", space="cosinesimil")
    index.loadIndex(filename=f"{base_path()}{model_name}/{index_file_name}")
    return index

def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print('id %s != pos %s' % (id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    print("Distance between points (0,0) " + str(nmslib.getDistance(index, 0, 0)))
    print("Distance between points (1,1) " + str(nmslib.getDistance(index, 1, 1)))
    print("Distance between points (0,1) " + str(nmslib.getDistance(index, 0, 1)))
    print("Distance between points (1,0) " + str(nmslib.getDistance(index, 1, 0)))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')
    print("Results for the freshly created index:")

    k = 3
    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print(idx, v)
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, data))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)

else:
    for i, d in enumerate(tqdm(corpus)):
        vectors.append(d['vector'])

M = args.M
efC = args.ef
num_threads = args.threads
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}

if args.is_sparse:
    index = nmslib.init(method='hnsw',
                        space='negdotprod_sparse',
                        data_type=nmslib.DataType.SPARSE_VECTOR)
else:
    index = nmslib.init(method='hnsw',
                        space='negdotprod',
                        data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(vectors)

start = time.time()
index.createIndex(index_time_params, print_progress=True)
end = time.time()
index_time = end - start
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % index_time)

index.saveIndex(os.path.join(args.hnsw_index, 'index.bin'), save_data=True)

metadata = copy.deepcopy(index_time_params)

def test_string_fresh(batch=True):
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        print('DATA_STRS', DATA_STRS)
        positions = nmslib.addDataPointBatch(index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    print("Distance between points (0,0) " + str(nmslib.getDistance(index, 0, 0)))
    print("Distance between points (1,1) " + str(nmslib.getDistance(index, 1, 1)))
    print("Distance between points (0,1) " + str(nmslib.getDistance(index, 0, 1)))
    print("Distance between points (1,0) " + str(nmslib.getDistance(index, 1, 0)))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)

    nmslib.setQueryTimeParams(index, query_time_param)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print(idx, data, res, [DATA_STRS[i] for i in res])

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)

def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print 'data.shape', data.shape
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print 'id %s != pos %s' % (id, pos)
                sys.exit(1)
    end = time.time()
    print 'added data in %s secs' % (end - start)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print idx, v
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print idx, nmslib.knnQuery(index, k, data)
    end = time.time()
    print 'querying done in %s secs' % (end - start)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)

searchTimes = []
reacll = []
k = 100
avgdistances = []
MMAXparam = []
dgraphParam = []
constructionClocks = []
searchClocks = []
clockAlg = []

import nmslib

for example in [(dgraph, MMAX) for dgraph in [0, 1, 2, 3] for MMAX in [2, 4, 6, 8, 10, 12]]:
    hnsw = nmslib.init(method='hnsw', space='l2')
    dgraph = example[0]
    MMAX = example[1]
    MMAXparam.append(example[1])
    dgraphParam.append(example[0])

    startClock = time.clock()
    startTime = process_time()
    hnsw.addDataPointBatch(train)
    hnsw.createIndex({'delaunay_type': dgraph, 'M': MMAX})
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.clock()
    constructionClock = endClock - startClock

def _get_index(self, space='cosinesimil'):
    return nmslib.init(method='vptree', space=space)

def testGlobal(self):
    # this is a one line reproduction of
    # https://github.com/nmslib/nmslib/issues/327
    GlobalTestCase.index = nmslib.init()

def _create_vector_space(self, file_path):
    vector_data = self._read_tsv_file(file_path)
    vector_space = nmslib.init(method='hnsw', space='cosinesimil')
    vector_space.addDataPointBatch(vector_data)
    vector_space.createIndex({'post': 2}, print_progress=True)
    return vector_space

def build_advanced_index(self, vecs: 'np.ndarray'):
    import nmslib

    _index = nmslib.init(method=self.method, space=self.space)
    _index.addDataPointBatch(vecs.astype(np.float32))
    _index.createIndex({'post': 2}, print_progress=self.print_progress)
    return _index

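# A hedged sketch of how an index built the way build_advanced_index() builds
# one could be queried in batch; the vectors, method/space choices, and k are
# illustrative assumptions. knnQueryBatch returns one (ids, distances) tuple
# per query row.
import numpy as np
import nmslib

vecs = np.random.rand(500, 32).astype(np.float32)
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vecs)
index.createIndex({'post': 2}, print_progress=False)

queries = np.random.rand(10, 32).astype(np.float32)
results = index.knnQueryBatch(queries, k=5, num_threads=4)
for ids, distances in results:
    print(ids, distances)
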
def _get_index(self, space='cosinesimil'):
    return nmslib.init(method='sw-graph', space=space)

def fit(self, x):
    self._index = nmslib.init(space=self._metric, method=self._method)
    self._index.addDataPointBatch(x)
    self._index.createIndex(index_params={'efConstruction': 500}, print_progress=True)
    self._index.setQueryTimeParams(params={'efSearch': 500})

def load(self):
    self.index = nmslib.init(method='hnsw', space='l2')
    self.index.loadIndex(self.cfg.faceidx_pkl)
    self.lookup_frame = pd.read_csv(self.cfg.framelookup_csv, index_col=0)

def _rebuild_index(self):
    self.index = nmslib.init(method="hnsw", space="cosinesimil")
    self.index.addDataPointBatch(data=self.embs[:self.current_capacity])
    self.index.createIndex(print_progress=self.print_progress)

def build_ann_index(feature_vectors):
    print('\nBuilding nmslib index')
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(feature_vectors)
    index.createIndex({'post': 2}, print_progress=True)
    return index

index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
print('Index-time parameters', index_time_params)

# Number of neighbors
K = 100

space_name = 'l2'

# Initialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(features_data)

# Create an index
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params)
end = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end - start))

def load(self, fn):
    self._index = nmslib.init(space=self._metric, method=self._method)
    self._index.loadIndex(fn)
    self._index.setQueryTimeParams(params={'efSearch': 500})

def bench_sparse_vector(batch=True):
    dim = 20000

    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print 'dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0]

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print 'q0 res', res[:k]

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])
    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print res[:5]
    for i in res[0]:
        print int(i), distance.cosine(q0, dataset[int(i), :])

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print 'positions', positions
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print idx, v
                if idx == 0:
                    for i in v:
                        print 'q0', i, distance.cosine(q0, dataset[i, :])
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print idx, res

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
