def test_string_loaded(): DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb" ] QUERY_STRS = ["abc", "def", "ghik"] space_type = 'leven' space_param = [] method_name = 'small_world_rand' index_name = method_name + '.index' index = nmslib.init( space_type, space_param, method_name, nmslib.DataType.OBJECT_AS_STRING, nmslib.DistType.INT) for id, data in enumerate(DATA_STRS): nmslib.addDataPoint(index, id, data) print 'Let\'s print a few data entries' print 'We have added %d data points' % nmslib.getDataPointQty(index) for i in range(0,min(MAX_PRINT_QTY,nmslib.getDataPointQty(index))): print nmslib.getDataPoint(index,i) print 'Let\'s invoke the index-build process' index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] query_time_param = ['initSearchAttempts=3'] nmslib.loadIndex(index, index_name) print "The index %s is loaded" % index_name nmslib.setQueryTimeParams(index, query_time_param) print 'Query time parameters are set' print "Results for the loaded index:" k = 2 for idx, data in enumerate(QUERY_STRS): print idx, nmslib.knnQuery(index, k, data) nmslib.freeIndex(index)
def test_vector_loaded(): space_type = 'cosinesimil' space_param = [] method_name = 'small_world_rand' index_name = method_name + '.index' index = nmslib.init( space_type, space_param, method_name, nmslib.DataType.DENSE_VECTOR, nmslib.DistType.FLOAT) for id, data in enumerate(read_data('sample_dataset.txt')): pos = nmslib.addDataPoint(index, id, data) if id != pos: print 'id %s != pos %s' % (id, pos) sys.exit(1) print 'Let\'s print a few data entries' print 'We have added %d data points' % nmslib.getDataPointQty(index) for i in range(0,min(MAX_PRINT_QTY,nmslib.getDataPointQty(index))): print nmslib.getDataPoint(index,i) print 'Let\'s invoke the index-build process' query_time_param = ['initSearchAttempts=3'] nmslib.loadIndex(index, index_name) print "The index %s is loaded" % index_name nmslib.setQueryTimeParams(index,query_time_param) print 'Query time parameters are set' print "Results for the loaded index" k = 2 for idx, data in enumerate(read_data('sample_queryset.txt')): print idx, nmslib.knnQuery(index, k, data) nmslib.freeIndex(index)
def fit(self, X):
    """Build — or load from disk, if a saved copy exists — an NMSLIB index
    over the rows of X.

    X: 2-D array-like of float vectors (only ``X.shape[0]`` and row
    iteration are used).  Side effects: populates ``self._index``, may
    append to ``self._index_param``, and writes the index to
    ``self._index_name`` when ``self._save_index`` is set.
    """
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what(): The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less
        # than <bucket size> * 1000
        # Aborted (core dumped)
        self._index_param.append(
            'bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

    self._index = nmslib.init(self._nmslib_metric, [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)
    for i, x in enumerate(X):
        nmslib.addDataPoint(self._index, i, x.tolist())

    if os.path.exists(self._index_name):
        # Parenthesized for consistency with the sibling fit() variant;
        # identical output for a single argument.
        print('Loading index from file')
        nmslib.loadIndex(self._index, self._index_name)
    else:
        nmslib.createIndex(self._index, self._index_param)
        if self._save_index:
            nmslib.saveIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def fit(self, X):
    """Construct the NMSLIB index (L2 space) for the rows of X.

    Loads the structure from ``self._index_name`` when a saved copy is
    present on disk; otherwise builds it with ``self._index_param`` and
    unconditionally saves it for the next run.
    """
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what(): The data_old size is too small or the bucket size is too
        # big. Select the parameters so that <total # of records> is NOT
        # less than <bucket size> * 1000
        # Aborted (core dumped)
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._index_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init('l2', [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    row_id = 0
    for row in X:
        nmslib.addDataPoint(self._index, row_id, row.tolist())
        row_id += 1

    if not os.path.exists(self._index_name):
        nmslib.createIndex(self._index, self._index_param)
        nmslib.saveIndex(self._index, self._index_name)
    else:
        print('Loading index from file')
        nmslib.loadIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def load(self, fname):
    """Load a previously saved index structure from the file *fname* into
    this wrapper's existing nmslib index handle (``self.index``)."""
    nmslib.loadIndex(self.index, fname)
num_lbls = int(sys.argv[4]) efS = int(sys.argv[5]) num_nbrs = int(sys.argv[6]) write_dist = int(sys.argv[7]) out_dir = sys.argv[8] num_thread = int(sys.argv[9]) num_out_threads = int(sys.argv[10]) metric_space = sys.argv[11] lbl_ft_file = sys.argv[12] index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) data = data_utils.read_sparse_file(lbl_ft_file) index.addDataPointBatch(data) nmslib.loadIndex(index, model_file) index.setQueryTimeParams({'efSearch': efS, 'algoType': 'old'}) start = time.time() query = data_utils.read_sparse_file(tst_ft_file) end = time.time() start = time.time() nbrs = index.knnQueryBatch(query, k=num_nbrs, num_threads=num_thread) end = time.time() print('Time taken to find approx nearest neighbors = %f' % (end - start)) batch_size = int(math.ceil(float(len(nbrs)) / float(num_out_threads))) for i in range(num_out_threads): Process(target=write_knn_out, args=(out_dir, write_dist, num_lbls,