Exemplo n.º 1
1
def test_string_loaded():
    """Query a pre-built small_world_rand index over Levenshtein space.

    Adds a small batch of strings to a fresh index handle, loads the
    serialized index from '<method_name>.index' (built by an earlier
    test), sets query-time parameters and runs a 2-NN query for each
    query string, printing the results.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"
                 ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'  # Levenshtein distance: integer-valued
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    # Strings are stored as opaque objects; leven distances are ints.
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    # 'point_id' instead of 'id' to avoid shadowing the builtin.
    for point_id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, point_id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    # The index is loaded from disk rather than rebuilt, so build-time
    # parameters are not needed here — only query-time ones.
    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the loaded index:")

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.freeIndex(index)
Exemplo n.º 2
0
def test_vector_loaded():
    """Query a pre-built small_world_rand index over cosine-similarity space.

    Reads dense vectors from 'sample_dataset.txt', loads the serialized
    index from disk, sets query-time parameters and runs a 2-NN query for
    every vector in 'sample_queryset.txt', printing the results.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    # NOTE: the original mixed tabs and spaces in this loop body, which is
    # a hard error (TabError) under Python 3 — normalized to spaces.
    for point_id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, point_id, data)
        # addDataPoint returns the internal position; it must match the
        # sequential id we assigned, otherwise the index is inconsistent.
        if point_id != pos:
            print('id %s != pos %s' % (point_id, pos))
            sys.exit(1)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    # Load-only path: no build-time parameters required.
    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the loaded index")

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.freeIndex(index)
Exemplo n.º 3
0
    def fit(self, X):
        """Build (or load from disk) an nmslib index over the rows of X.

        X is expected to be a 2-D array-like exposing .shape (e.g. a NumPy
        array); each row is added as one dense data point under the metric
        named by self._nmslib_metric.  If a serialized index already exists
        at self._index_name it is loaded instead of rebuilt; otherwise the
        index is created and, when self._save_index is set, saved for
        later runs.
        """
        if self._method_name == 'vptree':
            # Without capping the bucket size, vptree aborts with:
            #   std::runtime_error: The data size is too small or the
            #   bucket size is too big. Select the parameters so that
            #   <total # of records> is NOT less than <bucket size> * 1000
            self._index_param.append('bucketSize=%d' %
                                     min(int(X.shape[0] * 0.0005), 1000))

        self._index = nmslib.init(self._nmslib_metric, [], self._method_name,
                                  nmslib.DataType.DENSE_VECTOR,
                                  nmslib.DistType.FLOAT)

        for i, x in enumerate(X):
            nmslib.addDataPoint(self._index, i, x.tolist())

        if os.path.exists(self._index_name):
            print('Loading index from file')
            nmslib.loadIndex(self._index, self._index_name)
        else:
            nmslib.createIndex(self._index, self._index_param)
            if self._save_index:
                nmslib.saveIndex(self._index, self._index_name)

        nmslib.setQueryTimeParams(self._index, self._query_param)
Exemplo n.º 4
0
    def fit(self, X):
        """Fit an nmslib index on X (dense rows, L2 metric).

        Reuses a previously serialized index from self._index_name when
        one exists on disk; otherwise builds the index from the rows of X
        and saves it to that path.
        """
        if self._method_name == 'vptree':
            # vptree aborts with a std::runtime_error when the bucket size
            # is too large for the record count, so cap it relative to
            # the number of rows.
            bucket_size = min(int(X.shape[0] * 0.0005), 1000)
            self._index_param.append('bucketSize=%d' % bucket_size)

        self._index = nmslib.init('l2', [], self._method_name,
                                  nmslib.DataType.DENSE_VECTOR,
                                  nmslib.DistType.FLOAT)

        for row_id, row in enumerate(X):
            nmslib.addDataPoint(self._index, row_id, row.tolist())

        if not os.path.exists(self._index_name):
            nmslib.createIndex(self._index, self._index_param)
            nmslib.saveIndex(self._index, self._index_name)
        else:
            print('Loading index from file')
            nmslib.loadIndex(self._index, self._index_name)

        nmslib.setQueryTimeParams(self._index, self._query_param)
Exemplo n.º 5
0
 def load(self, fname):
     """Restore a previously saved nmslib index from *fname* into self.index."""
     nmslib.loadIndex(self.index, fname)
Exemplo n.º 6
0
# Remaining command-line arguments; argv[1..3] appear to be consumed in an
# earlier, unseen part of this script — TODO confirm.
num_lbls = int(sys.argv[4])
efS = int(sys.argv[5])               # HNSW 'efSearch' query-time parameter
num_nbrs = int(sys.argv[6])          # k for the batched k-NN query below
write_dist = int(sys.argv[7])
out_dir = sys.argv[8]
num_thread = int(sys.argv[9])        # threads for knnQueryBatch
num_out_threads = int(sys.argv[10])  # worker processes for writing output
metric_space = sys.argv[11]
lbl_ft_file = sys.argv[12]           # sparse label-feature file

# Build an HNSW index handle over sparse cosine-similarity space and load
# pre-built index data from model_file (defined earlier, not visible here).
index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
data = data_utils.read_sparse_file(lbl_ft_file)
index.addDataPointBatch(data)
nmslib.loadIndex(index, model_file)

index.setQueryTimeParams({'efSearch': efS, 'algoType': 'old'})

start = time.time()
# tst_ft_file is set earlier in the script — not visible in this chunk.
query = data_utils.read_sparse_file(tst_ft_file)
end = time.time()
start = time.time()
nbrs = index.knnQueryBatch(query, k=num_nbrs, num_threads=num_thread)
end = time.time()
print('Time taken to find approx nearest neighbors = %f' % (end - start))

# Split the neighbor results into num_out_threads roughly equal batches.
batch_size = int(math.ceil(float(len(nbrs)) / float(num_out_threads)))
for i in range(num_out_threads):
    Process(target=write_knn_out,
            args=(out_dir, write_dist, num_lbls,