Example No. 1
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
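For reference, the asserted value is simply the Euclidean norm of the difference of the two vectors; a quick cross-check with numpy (not part of the original test):

import numpy as np

print(np.linalg.norm(np.array([0, 1]) - np.array([1, 1])))  # 1.0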
Example No. 2
 def test_metric_kwarg(self):
     # Issue 211
     i = AnnoyIndex(2, metric='euclidean')
     i.add_item(0, [1, 0])
     i.add_item(1, [9, 0])
     self.assertAlmostEqual(i.get_distance(0, 1), 8)
     self.assertEqual(i.f, 2)
Example No. 3
class ImageSearchAnnoyCombo:
    '''
    load an Annoy index for approximate nearest neighbor computation
    Annoy's angular distance uses dist(u,v) = 2(1-cos(u,v))
    '''
    def __init__(self,h5fname = 'X_ILSVRC2015.hdf5',annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',dset = 'fc6fc7'):
        #load h5 data
        h5f = h5py.File(h5fname,'r')
        self.X = h5f[dset]
        #load filenames
        with open(imageListPath,'r') as f:
            self.line_to_file = {i:line.rstrip() for i,line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1],'angular')
        self.A.load(annf)

    def run_query_approx(self,query,n=100,accuracy_factor = 5):
        nearest,scores = self.A.get_nns_by_vector(query, n, search_k=n*int(accuracy_factor)*128, include_distances=True)
        return zip((self.line_to_file[i] for i in nearest),scores)

    def run_query_exact(self,query,n=1000,nsmall=100):
        #retrieve approximate nearest neighbors using Annoy, then do exact ranking by loading from h5 into memory
        #use Annoy
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        #use scipy cdist (or normalize first and do dot product for faster computation)
        #getting X by index from disc is very slow. 
        distance = (cdist(self.X[indexes_sorted], query.reshape((1,query.shape[0])), 'cosine'))[:,0]
        ind = np.argpartition(distance, nsmall)[:nsmall]#partial sort, indices for top n,
        s_ind = np.argsort(distance[ind])#sort 
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest),scoresorted)
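A rough usage sketch of the class above, assuming the default HDF5 file, file list and .ann index from the constructor actually exist; the query vector here is random and purely illustrative:

import numpy as np

searcher = ImageSearchAnnoyCombo()                      # loads features and mmaps the Annoy index
query = np.random.randn(searcher.X.shape[1]).astype('float32')
for filename, score in searcher.run_query_approx(query, n=5):
    print(score, filename)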
Example No. 4
    def test_tuple(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            i.add_item(j, (random.gauss(0, 1) for x in xrange(f)))

        i.build(n_trees)
Example No. 5
def do(indextype):
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print >> out, '%s\t%s' % (q_index, ','.join([str(n) for n in nns]))
Example No. 6
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5))**0.5)
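The expected value follows from the angular metric used here, dist(u, v) = sqrt(2 * (1 - cos(u, v))); a small numpy cross-check for the two vectors above:

import numpy as np

u, v = np.array([0.0, 1.0]), np.array([1.0, 1.0])
cos_uv = u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))   # 1/sqrt(2)
print(np.sqrt(2 * (1.0 - cos_uv)))                            # ~0.7654, matches the assertion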
Example No. 7
    def _get_index(self, f, distance):
        input = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)
        
        if not os.path.exists(output):
            if not os.path.exists(input):
                # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
                # Hosting them on my own S3 bucket since the original files changed format
                url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
                print('downloading', url, '->', input)
                urlretrieve(url, input)

            print('building index', distance, f)
            annoy =  AnnoyIndex(f, 12, "test_db", 10,  1000, 3048576000, 0)
            v_v = []
            items = []
            for i, line in enumerate(gzip.open(input, 'rb')):
                v = [float(x) for x in line.strip().split()[1:]]
                v_v.append(v)
                items.append(i)
                if (i+1) % 10000 == 0:
                    print (i+1)
                    annoy.add_item_batch(items, v_v)
                    v_v = []
                    items = []
            if v_v:
                annoy.add_item_batch(items, v_v)
        return annoy
Example No. 8
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
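A hypothetical call of the helper above, with searchSpace as a list of equal-length numpy vectors:

import numpy as np

searchSpace = [np.random.rand(64) for _ in range(500)]
index = ANN(searchSpace)
print(index.get_nns_by_item(0, 5))   # ids of the 5 points nearest to item 0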
Example No. 9
    def test_dist_degen(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [1, 0])
        i.add_item(1, [0, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 2.0**0.5)
Example No. 10
    def retrieve(self):

        print 'Loading necessary files..'
        u = AnnoyIndex(self.dim, metric='angular')
        u.load(index_file)

        print 'ANN Retrieval..'
        for n_neighbors in knns:
            print 'Number of neighbors: ' + str(n_neighbors)
            for mult in self.multipliers:
                print 'Multiplier: ' + str(mult)
                search_k = self.n_trees * n_neighbors * mult
                filename = '.'.join((self.test_file.split('/')[-1].split('.')[:-1]))
                with open(self.test_file, 'r') as data_file:
                    data = json.load(data_file)
                    qArray = []
                    for i in range(len(data["questions"])):
                        question_body = data["questions"][i]["body"]
                        question_id = data["questions"][i]["id"]
                        qcentroid = np.transpose(np.array(get_centroid_idf(question_body, self.emb, self.idf, self.stopwords, self.dim)))

                        anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                        doc_anns = []
                        for n in anns:
                            doc_anns.append(self.idmap[n])
                        q = Question(question_body, question_id, doc_anns)
                        qArray.append(q)
                    directory = "system_results/"
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    with open(str(directory)+"/"+"CentIDF_annoy_"+str(n_trees)+"_"+str(n_neighbors)+"_"+str(mult)+".json", "w+") as outfile:
                        outfile.write(json.dumps({"questions":[ob.__dict__ for ob in qArray]}, indent=2))
Example No. 11
 def test_no_items(self):
     idx = AnnoyIndex(100)
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 0)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example No. 12
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example No. 13
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
Example No. 14
 def test_get_item_vector(self):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
     for j in xrange(100):
         print(j, '...')
         for k in xrange(1000 * 1000):
             i.get_item_vector(0)
Example No. 15
def build_index(counts,label_to_id,dimension):
    index = AnnoyIndex(dimension,metric='angular')
    for label,cnt_list in counts.items():
        id = label_to_id[label]
        index.add_item(id,cnt_list)

    index.build(100)
    return index
Example No. 16
    def test_wrong_length(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
        self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in xrange(f+1000)])
        self.assertRaises(IndexError, i.add_item, 2, [])

        i.build(n_trees)
Example No. 17
    def __init__(self, fn_word, model_name, model_path):
        self.model = QueryModel(fn_word, model_name, model_path)
        self.queries = []
        self.titles = []

        self.query_index = 0
        self.title_index = 0
        self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean')
        self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean')
Example No. 18
def create_index_tree(clusters):
    features = clusters.shape[1]
    tree = AnnoyIndex(features, metric='euclidean')

    for i, v in enumerate(clusters):
        tree.add_item(i, v.tolist())

    tree.build(features*2)
    return tree
Example No. 19
    def _build_from_model(self, vectors, labels, num_features):
        index = AnnoyIndex(num_features)

        for vector_num, vector in enumerate(vectors):
            index.add_item(vector_num, vector)

        index.build(self.num_trees)
        self.index = index
        self.labels = labels
Example No. 20
class FeatureNN:
    tree = None

    def __init__(self, features, tree_file):
        self.tree = AnnoyIndex(features, metric='euclidean')
        self.tree.load(str(tree_file))

    def nn(self, x):
        return self.tree.get_nns_by_vector(x.tolist(), 1)[0]
Example No. 21
    def test_numpy(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            a = numpy.random.normal(size=f)
            a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
            i.add_item(j, a)

        i.build(n_trees)
Example No. 22
    def test_dist_degen(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 2
        i = AnnoyIndex(f, 2, "test_db", 64,  1000, 3048576000, 0)
        
        i.add_item(0, [1, 0])
        i.add_item(1, [0, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 2.0)
Example No. 23
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn)
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example No. 24
    def test_write_failed(self):
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)

        if sys.platform == "linux" or sys.platform == "linux2":
            # linux
            try:
                t.save("/dev/full") 
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
        elif sys.platform == "darwin":
            volume = "FULLDISK"
            device = os.popen('hdiutil attach -nomount ram://64').read()
            os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
            os.popen('touch "/Volumes/%s/full"' % volume)
            try:
                t.save('/Volumes/%s/annoy.tree' % volume)
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
            finally:
                os.popen("hdiutil detach %s" % device)
Example No. 25
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j
        
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
Example No. 26
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
Example No. 27
 def test_range_errors(self, n_points=1000, n_trees=10):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     for j in xrange(n_points):
         i.add_item(j, [random.gauss(0, 1) for x in xrange(f)])
     self.assertRaises(IndexError, i.add_item, -1, [random.gauss(0, 1) for x in xrange(f)])
     i.build(n_trees)
     for bad_index in [-1000, -1, n_points, n_points + 1000]:
         self.assertRaises(IndexError, i.get_distance, 0, bad_index)
         self.assertRaises(IndexError, i.get_nns_by_item, bad_index, 1)
         self.assertRaises(IndexError, i.get_item_vector, bad_index)
Example No. 28
    def test_threads(self):
        n, f = 10000, 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n):
            i.add_item(j, numpy.random.normal(size=f))
        i.build(10)

        pool = multiprocessing.pool.ThreadPool()
        def query_f(j):
            i.get_nns_by_item(1, 1000)
        pool.map(query_f, range(n))
Example No. 29
    def test_dist(self):
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        
        f = 2   
        i = AnnoyIndex(f,  2, "test_db", 64,  1000, 3048576000, 0)
        # i.verbose(True)
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
Example No. 30
 def test_get_nns_by_item_batch(self):
     print "test_get_nns_by_item_batch "
     os.system("rm -rf test_db")
     os.system("mkdir test_db")
     f = 3
     i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
     i.add_item_batch([0,1,2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])
    
     self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
     self.assertEqual(i.get_nns_by_item(1, 3), [1, 0, 2])
     self.assertTrue(i.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]) # could be either
Example No. 31
# coding: utf-8
from annoy import AnnoyIndex
import json
import random
import redis

redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)

illust_vector_list = []
dim = 512
t = AnnoyIndex(dim)  # Length of item vector that will be indexed
path = '../keras/pixiv-ranking-features.txt'

with open(path, 'r') as f:
    i = 0
    line = f.readline()
    while line:
        js = json.loads(line)
        print(js['illust_id'])
        illust_vector_list.append(js)
        if i > 1e3:
            break
        i = i + 1
        line = f.readline()

for i, illust_vector in enumerate(illust_vector_list):
    illust_id = illust_vector['illust_id']

    line2illustId = f"line2illustId_{i}"
    illustId2line = f"illustId2line_{illust_id}"
    print(f"{line2illustId} -> {illustId2line}")
Example No. 32
def main():
    # load face features
    u = AnnoyIndex(512, metric="euclidean")
    u.load(config.FACE_FEATURES)

    video = args.video_path
    f = open(args.output_path, "w")

    fvs = FileVideoStream(video).start()
    time.sleep(1.0)
    fps = FPS().start()

    count = 0
    frame_count = 0
    while fvs.more():
        img = fvs.read()
        f.write("frame_{}".format(frame_count))
        img = cv2.resize(img,
                         None,
                         fx=config.RESIZE_IMAGE,
                         fy=config.RESIZE_IMAGE)
        bboxlist = detector.detect(img)
        for b in bboxlist:
            x1, y1, x2, y2, s = b
            if (s > config.DETECTION_THRESHOLD):
                x1 = int(x1)
                x2 = int(x2)
                y1 = int(y1)
                y2 = int(y2)
                width = x2 - x1
                height = y2 - y1

                if width >= config.MIN_SIZE and height >= config.MIN_SIZE:
                    ret = mtcnn_detector.detect_face(img[y1:y2, x1:x2],
                                                     det_type=1)
                    if ret is None:
                        continue
                    face_image = "frame_" + str(frame_count) + "_face_" + str(
                        count) + ".jpg"
                    count += 1
                    f.write(",object_{},position,{},{},{},{}".format(
                        count, x1, x2, y1, y2))
                    bbox, landmarks = ret
                    if landmarks is None:
                        continue

                    pointx = landmarks[0][:5]
                    pointy = landmarks[0][5:]
                    pointx_img_space = map(lambda x: x + x1, pointx)
                    pointy_img_space = map(lambda y: y + y1, pointy)
                    landmarks_img_space = list(pointx_img_space) + list(
                        pointy_img_space)
                    bbox_process = np.array([x1, y1, x2, y2])
                    landmarks_process = np.array(landmarks_img_space).reshape(
                        (2, 5)).T
                    nimg = preprocess(img,
                                      bbox_process,
                                      landmarks_process,
                                      image_size="112,112")

                    e = embedding.get_feature(nimg)
                    match = u.get_nns_by_vector(e,
                                                config.TOP_ACC_NUM,
                                                include_distances=True)

                    pose = headpose.get_pose(nimg)
                    cv2.imwrite(
                        os.path.join(FACE_IMAGE_PATH, "aligned_" + face_image),
                        nimg)
                    f.write(",pose,{},{},{}".format(pose[0], pose[1], pose[2]))

                    identified_face_top_1 = face_db_pd[(match[0][0])]
                    score_top_1 = match[1][0]
                    if score_top_1 < config.MATCH_THRESHOLD:
                        cv2.putText(img, identified_face_top_1, (x1, y1),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
                        for i in range(config.TOP_ACC_NUM):
                            identified_face = face_db_pd[(match[0][i])]

                            if (abs(pose[0]) > config.POSE_THRESHOLD
                                    or abs(pose[1]) > config.POSE_THRESHOLD
                                    or abs(pose[2]) > config.POSE_THRESHOLD):
                                continue
                            if (match[1][i] < config.MATCH_THRESHOLD):
                                f.write(",{},{}".format(
                                    identified_face, match[1][i]))

                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 1)

        frame_count += 1
        f.write("\n")
        cv2.imshow("img_window", img)
        fps.update()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    fps.stop()
    cv2.destroyAllWindows()
    fvs.stop()
    f.close()
Example No. 33
def initialiseAnnoy(part_of_speech):
	# Loads Annoy ADJ vectors
	if part_of_speech == "adjective":
		adj_annoy = AnnoyIndex(50, metric='angular')
		adjectives = list()
		adj_lookup = dict()

		print("\n\tLoading adjective vectors...")
		for i, line in enumerate(open("./vectors/adjVectors", "r")):
			line = line.strip()
			word, vec_s = line.split("  ")
			vec = [float(n) for n in vec_s.split()]
			adj_annoy.add_item(i, vec)
			adj_lookup[word] = vec
			adjectives.append(word)
		adj_annoy.build(50)

		return adj_annoy, adjectives, adj_lookup

	# Loads Annoy VERB vectors
	elif part_of_speech == "verb":
		verb_annoy = AnnoyIndex(50, metric='angular')
		verbs = list()
		verb_lookup = dict()

		print("\n\tLoading verb vectors...")
		for i, line in enumerate(open("./vectors/verbVectors", "r")):
			line = line.strip()
			word, vec_s = line.split("  ")
			vec = [float(n) for n in vec_s.split()]
			verb_annoy.add_item(i, vec)
			verb_lookup[word] = vec
			verbs.append(word)
		verb_annoy.build(50)

		return verb_annoy, verbs, verb_lookup

	# Loads Annoy NOUN vectors
	elif part_of_speech == "noun":
		noun_annoy = AnnoyIndex(50, metric='angular')
		nouns = list()
		noun_lookup = dict()

		print("\n\tLoading noun vectors...")
		for i, line in enumerate(open("./vectors/nounVectors", "r")):
			line = line.strip()
			word, vec_s = line.split("  ")
			vec = [float(n) for n in vec_s.split()]
			noun_annoy.add_item(i, vec)
			noun_lookup[word] = vec
			nouns.append(word)
		noun_annoy.build(50)

		return noun_annoy, nouns, noun_lookup

	else:
		raise Exception("Part of speech must either be 'adjective', 'verb' or 'noun'.")
Example No. 34
def main(project_name):

    tic = time.time()

    logger = Logger('_05_make_submission_1000_{}'.format(project_name))
    logger.info('=' * 50)

    model_path = '_model/embedding_model_{}.pt'.format(project_name)
    logger.info('load model from {}'.format(model_path))
    model = torch.load(model_path)
    model.eval()

    dir_target = '../../input/test'
    embedder = ImgEmbedder(model, dir_target)

    sample_submission = pd.read_csv('../../dataset/sample_submission.csv')

    images = list()

    with open(
            os.path.join('_embed_index',
                         'index_names_{}.json'.format(project_name)),
            'r') as f:
        index_names = json.load(f)

    test_id_list = sample_submission.id

    f = 512
    u = AnnoyIndex(f, metric='euclidean')
    u.load(
        os.path.join('_embed_index',
                     'index_features_{}.ann'.format(project_name)))

    logger.info('===> embed test images and get nearest neighbors')

    search_k = 1_000_000

    for test_id in tqdm(test_id_list):

        target_file = '{}.jpg'.format(test_id)

        try:
            img_feature = embedder.get_vector(target_file)
            indeces = u.get_nns_by_vector(img_feature.tolist(),
                                          n=1000,
                                          search_k=search_k)
        except:
            indeces = list(range(1000))

        names = [index_names[index] for index in indeces]

        images.append(' '.join(names))

    submission = pd.DataFrame(test_id_list, columns=['id'])
    submission['images'] = images

    output_path = '../../submission/submission_1000_{}.csv'.format(
        project_name)
    submission.to_csv(output_path, index=False)

    toc = time.time() - tic
    logger.info('Elapsed time: {:.1f} [min]'.format(toc / 60.0))
Example No. 35
 def test_item_vector_after_save(self):
     # Issue #279
     a = AnnoyIndex(3)
     a.verbose(True)
     a.add_item(1, [1, 0, 0])
     a.add_item(2, [0, 1, 0])
     a.add_item(3, [0, 0, 1])
     a.build(-1)
     self.assertEquals(a.get_n_items(), 4)
     self.assertEquals(a.get_item_vector(3), [0, 0, 1])
     self.assertEquals(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
     a.save('something.annoy')
     self.assertEquals(a.get_n_items(), 4)
     self.assertEquals(a.get_item_vector(3), [0, 0, 1])
     self.assertEquals(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
Example No. 36
 def test_seed(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     i.set_seed(42)
Example No. 37
 def test_metric_f_kwargs(self):
     i = AnnoyIndex(f=3, metric='euclidean')
Example No. 38
def get_subsampling_index2(data_process, standard_scale = True, cutoff_sig = 0.02, rate = 0.3, \
                           method = "pykdtree", verbose = 1, image_index = []):
    """
    Using Nearest-Neighbor search based algorithm, find the list of indices of the subsampled dataset
    
    
    Parameters
    -------------
    data_process: List. the list of datapoints, with selected features
    
    standard_scale [True]: Boolean. Whether to apply standard scaler to the dataset prior to subsampling
    
    cutoff_sig [0.02]: Float. Cutoff significance: the cutoff distance equals the Euclidean
                       norm of the standard deviations in all dimensions of the data points
    
    rate [0.3]: Float. probability of deletion
    
    method ["pykdtree"]: String. which backend nearest neighbour model to use. 
                         possible choices: ["pykdtree", "nmslib", "sklearn", "scipy", "annoy", "flann"]
    
    verbose [1]: integer. level of verbosity
    
    
    Return
    -------------
    overall_keep_list: The list of indices of the final subsampled entries
    
    """

    if verbose >= 1:
        print("Started NN-subsampling, original length: {}".format(
            len(data_process)))

    method = method.lower()
    start = time.time()

    if method == "flann":
        if verbose >= 1:
            print("use flann backend")
    elif method == "pykdtree":
        if verbose >= 1:
            print("use pykdtree backend")
    elif method == "sklearn":
        if verbose >= 1:
            print("use slearn nearest neighbors backend")
    elif method == "scipy":
        if verbose >= 1:
            print("use scipy cKDTree backend")
    elif method == "annoy":
        if verbose >= 1:
            print("use annoy backend")
    elif method == "nmslib":
        if verbose >= 1:
            print("use nmslib backend")
    else:
        print("method {} not impletemented".format(method))
        raise NotImplemented

    # apply standard scaling
    if standard_scale:
        if verbose >= 2:
            print("Subample with standard scaled data")
        data_process = StandardScaler().fit_transform(
            np.asarray(data_process).copy())
    else:
        if verbose >= 2:
            print("Subample with original data")
        data_process = np.asarray(data_process).copy()

    #set cutoff distance
    list_of_descs = zip(*data_process)
    sum_std2 = 0.
    for descs in list_of_descs:
        temp_std = np.std(descs)
        sum_std2 += temp_std**2
    cutoff = cutoff_sig * np.sqrt(sum_std2)

    #initialize the index
    overall_keep_list = np.arange(len(data_process)).tolist()

    keep_going = True
    iter_count = 1
    while keep_going:
        if verbose >= 2:
            print('start iteration {}, total length: {}'.format(
                iter_count, len(overall_keep_list)))
        start_cycle = time.time()
        temp_data_process = get_array_based_on_index(data_process.copy(),
                                                     overall_keep_list)
        temp_image_index = get_array_based_on_index(image_index,
                                                    overall_keep_list)

        #build and query nearest neighbour model
        if method == "flann":
            flann = FLANN()
            indices, distances = flann.nn(temp_data_process,
                                          temp_data_process,
                                          2,
                                          algorithm="kmeans")
        elif method == "scipy":
            kd_tree = cKDTree(temp_data_process)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "pykdtree":
            kd_tree = KDTree(temp_data_process, leafsize=6)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "sklearn":
            nbrs = NearestNeighbors(n_neighbors=2,
                                    algorithm='kd_tree',
                                    n_jobs=-1).fit(temp_data_process)
            distances, indices = nbrs.kneighbors(temp_data_process)
        elif method == "annoy":
            annoy = AnnoyIndex(len(temp_data_process[0]), metric='euclidean')
            for i in range(len(temp_data_process)):
                annoy.add_item(i, temp_data_process[i])
            annoy.build(1)
            distances = []
            indices = []
            for i in range(len(temp_data_process)):
                temp_index, temp_dist = annoy.get_nns_by_vector(
                    temp_data_process[i], 2, include_distances=True)
                indices.append([i, temp_index[1]])
                distances.append([0.0, temp_dist[1]])
        elif method == "nmslib":
            index = nmslib.init(method='hnsw', space='l2')
            index.addDataPointBatch(temp_data_process)
            index.createIndex(print_progress=False)

            neighbours = index.knnQueryBatch(temp_data_process, k=2)

            distances = []
            indices = []
            for item in neighbours:
                indices.append(item[0])
                distances.append(item[1])

        else:
            raise NotImplementedError

        # if distance between each point and its nearest neighbor is below cutoff distance,
        # add the nearest neighbour to the candidate removal list
        remove_index_li = []
        index_li = []

        for index, distance in zip(indices, distances):
            index_li.append(index[0])
            if distance[1] <= cutoff:
                remove_index_li.append(index[1])

        # randomly select datapoints in the candidate removal list (based on rate)
        # and form the final removal list of this iteration
        # stop the cycle if the final removal list is empty
        temp_num = int(ceil(float(len(remove_index_li)) * rate))

        if temp_num == 0:
            keep_going = False
        #remove_index_li = random_subsampling(remove_index_li,temp_num)
        remove_index_li = rank_subsampling(remove_index_li, temp_num,
                                           temp_image_index)

        temp_keep_list = remove_list_from_list(index_li, remove_index_li)
        overall_keep_list = [overall_keep_list[i] for i in temp_keep_list]
        try:
            if len(overall_keep_list) == old_overall_keep_list_len:
                keep_going = False
                print("stopped because length didn't change")
        except:
            pass
        if verbose >= 2:
            print('end iteration {}. length: {}\t time:{}'.format(
                iter_count, len(overall_keep_list),
                time.time() - start_cycle))
        iter_count += 1
        old_overall_keep_list_len = len(overall_keep_list)
    if verbose >= 1:
        print('end NN-subsampling. length: {}\t time:{}'.format(
            len(overall_keep_list),
            time.time() - start))
    return overall_keep_list
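For orientation, here is a stripped-down, self-contained sketch of a single iteration of the routine above using only the annoy backend; the helper functions from the original module are not assumed, and names are illustrative:

import numpy as np
from annoy import AnnoyIndex

def one_subsample_pass(points, cutoff_sig=0.02, rate=0.3, seed=0):
    """Drop a fraction of points whose nearest neighbour lies within the cutoff distance."""
    rng = np.random.default_rng(seed)
    pts = np.asarray(points, dtype=float)
    cutoff = cutoff_sig * np.sqrt((pts.std(axis=0) ** 2).sum())

    index = AnnoyIndex(pts.shape[1], 'euclidean')
    for i, p in enumerate(pts):
        index.add_item(i, p)
    index.build(1)

    candidates = []
    for i in range(len(pts)):
        nbrs, dists = index.get_nns_by_vector(pts[i], 2, include_distances=True)
        if dists[1] <= cutoff:            # dists[0] is the point itself (distance 0)
            candidates.append(nbrs[1])

    n_drop = int(np.ceil(len(candidates) * rate))
    drop = set(rng.choice(candidates, size=n_drop, replace=False)) if n_drop else set()
    return [i for i in range(len(pts)) if i not in drop]

keep = one_subsample_pass(np.random.normal(size=(2000, 3)))
print(len(keep), "of 2000 points kept after one pass")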
Example No. 39
 def test_save_twice(self):
     # Issue #100
     t = AnnoyIndex(10)
     t.save("t.ann")
     t.save("t.ann")
Example No. 40
 def test_construct_destruct(self):
     for x in range(100000):
         i = AnnoyIndex(10)
         i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
Example No. 41
 def test_construct_load_destruct(self):
     for x in range(100000):
         i = AnnoyIndex(10)
         i.load('test/test.tree')
Example No. 42
 def test_load_unload(self):
     # Issue #108
     i = AnnoyIndex(10)
     for x in range(100000):
         i.load('test/test.tree')
         i.unload()
Example No. 43
from nltk import ngrams
from annoy import AnnoyIndex
import random, json, glob, os, codecs
import numpy as np

# data structures
file_index_to_file_name = {}
file_index_to_file_vector = {}

# config
dims = 2048
n_nearest_neighbors = 200
trees = 100
infiles = glob.glob('feature_vectors/*.npz')

# build ann index
t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
    file_vector = np.loadtxt(i)
    file_name = os.path.basename(i).split('.')[0]
    print("file_name = %s" % file_name)
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    t.add_item(file_index, file_vector)
t.build(trees)

# create a nearest neighbors json file for each input
if not os.path.exists('bran_neighbors'):
    os.makedirs('bran_neighbors')

# do the similarity search for the feature vectors
# this is a <filename*.npz> file
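The snippet ends before the similarity-search loop described in the last two comments; a hedged sketch of what that step could look like (the JSON field names are made up):

for file_index, file_name in file_index_to_file_name.items():
    neighbor_ids, distances = t.get_nns_by_item(file_index, n_nearest_neighbors, include_distances=True)
    neighbors = [{'filename': file_index_to_file_name[j], 'distance': d}
                 for j, d in zip(neighbor_ids, distances)]
    with open(os.path.join('bran_neighbors', file_name + '.json'), 'w') as out:
        json.dump(neighbors, out)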
Example No. 44
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100) # 100 trees

    match = 0
    no_match = 0
    ann_accuracy = 0
    total = 0
    precise = 0
    
    triplets = {}
    closest_positive_counts = []
    
    pos_distances = []
    neg_distances = []
    all_pos_distances = []
    all_neg_distances = []

    triplets['anchor'] = []
    triplets['positive'] = []
    triplets['negative'] = []

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        nearest = t.get_nns_by_vector(predictions[index], NNlen)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[key])
            
        # annoy has this annoying habit of returning the queried item back as a nearest neighbor.  Remove it.
        if key in nearest_text:
            nearest_text.remove(key)
        # print("query={} names = {} true_match = {}".format(unique_text[index], nearest_text, expected_text))
        overlap = expected_text.intersection(nearest_text)
        # collect up some statistics on how well we did on the match
        m = len(overlap)
        match += m
        # since we asked for only x nearest neighbors, and we get at most x-1 neighbors that are not the same as key (!)
        # make sure we adjust our estimate of no match appropriately
        no_match += min(len(expected_text), NNlen - 1) - m

        # sample only the negatives that are true negatives
        # that is, they are not in the expected set - sampling only 'semi-hard' negatives is not defined here
        # positives = expected_text - nearest_text
        positives = overlap
        negatives = nearest_text - expected_text

        # print(key + str(expected_text) + str(nearest_text))
        for i in negatives:
            for j in positives:
                dist_pos = t.get_distance(index, entity2unique[j])
                pos_distances.append(dist_pos)
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)
                if dist_pos < dist_neg:
                    ann_accuracy += 1
                total += 1
                # print(key + "|" +  j + "|" + i)
                # print(dist_pos)
                # print(dist_neg)               

        min_neg_distance = 1000000        
        for i in negatives:
            dist_neg = t.get_distance(index, entity2unique[i])
            all_neg_distances.append(dist_neg)
            if dist_neg < min_neg_distance:
                    min_neg_distance = dist_neg

        for j in expected_text:
            dist_pos = t.get_distance(index, entity2unique[j])
            all_pos_distances.append(dist_pos)

        closest_pos_count = 0
        for p in overlap:
            dist_pos = t.get_distance(index, entity2unique[p])
            if dist_pos < min_neg_distance:
                closest_pos_count+=1

        if closest_pos_count > 0:
            precise+=1

        closest_positive_counts.append(closest_pos_count / min(len(expected_text), NNlen - 1))


            
        for i in negatives:
            for j in expected_text:
                triplets['anchor'].append(key)
                triplets['positive'].append(j)
                triplets['negative'].append(i)

    print("mean closest positive count:" + str(statistics.mean(closest_positive_counts)))
    print("mean positive distance:" + str(statistics.mean(pos_distances)))
    print("stdev positive distance:" + str(statistics.stdev(pos_distances)))
    print("max positive distance:" + str(max(pos_distances)))
    print("mean neg distance:" + str(statistics.mean(neg_distances)))
    print("stdev neg distance:" + str(statistics.stdev(neg_distances)))
    print("max neg distance:" + str(max(neg_distances)))
    print("mean all positive distance:" + str(statistics.mean(all_pos_distances)))
    print("stdev all positive distance:" + str(statistics.stdev(all_pos_distances)))
    print("max all positive distance:" + str(max(all_pos_distances)))
    print("mean all neg distance:" + str(statistics.mean(all_neg_distances)))
    print("stdev all neg distance:" + str(statistics.stdev(all_neg_distances)))
    print("max all neg distance:" + str(max(all_neg_distances)))
    print("Accuracy in the ANN for triplets that obey the distance func:" + str(ann_accuracy / total))
    print("Precision at 1: " +  str(precise / len(entity2same)))
    
    obj = {}
    obj['accuracy'] = ann_accuracy / total
    obj['steps'] = 1
    with open(output_file_name_for_hpo, 'w', encoding='utf8') as out:
        json.dump(obj, out)

    if test:
        return match/(match + no_match)
    else:
        return triplets, match/(match + no_match)
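As the comment inside the loop above notes, when the query vector is itself stored in the index it normally comes back as its own nearest neighbour (at distance 0); a minimal illustration:

from annoy import AnnoyIndex

t = AnnoyIndex(2, 'euclidean')
t.add_item(0, [0.0, 0.0])
t.add_item(1, [1.0, 0.0])
t.add_item(2, [2.0, 0.0])
t.build(10)
print(t.get_nns_by_vector([0.0, 0.0], 3))   # [0, 1, 2] -- the stored copy of the query comes back first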
Example No. 45
        for p in classif['body']['predictions']:
            uri =  p['uri']
            rois = p['rois']
            sys.stdout.write('\rIndexing image '+str(d)+'/'+str(len(onlyfiles)) + ' : ' + str(len(rois)) + ' rois  total:' + str(c) + '   ')
            sys.stdout.flush()

            for roi in rois:
                bbox = roi['bbox']
                cat = roi['cat']
                prob = roi['prob']
                vals = roi['vals']
                if c == 0:
                    layer_size = len(vals)
                    s['layer_size'] = layer_size
                    t = AnnoyIndex(layer_size,metric) # prepare index
                t.add_item(c,vals)
                s[str(c)] = {'uri':uri, 'bbox' : bbox, 'cat' : cat, 'prob' : prob}
                c = c + 1
            d = d + 1
        #if c >= 10000:
        #    break
    print 'building index...\n'
    print 'layer_size=',layer_size
    t.build(ntrees)
    t.save('index.ann')
    s.close()

if args.search:
    s = shelve.open('data.bin')
    u = AnnoyIndex(s['layer_size'],metric)
Example No. 46
#!/bin/env python3
import sys, argparse
from annoy import AnnoyIndex
import re
import statistics

parser = argparse.ArgumentParser()
parser.add_argument('--file', help='Input file')
parser.add_argument('--out', help='Outfile base')
parser.add_argument('--L', help='Fingerprint length')
parser.add_argument('--norm', help='Normalize')
args = parser.parse_args()

a = AnnoyIndex(int(args.L))
i = 0
names = []

with open(args.file, 'r') as f:
    for line in f:
        id, statements, *v = line.split("\t")
        id = re.sub('.json.gz', '', id)
        id = re.sub('\.', '|', id)
        names.append(id)
        v = [float(j) for j in v]
        if args.norm:
            avg = statistics.mean(v)
            std = statistics.stdev(v)
            v = [(j - avg) / std for j in v]
        a.add_item(i, v)
        i = i + 1
Example No. 47
class FakeDetector:
    """
    #TODO: Complete the description
    """
    def __init__(self):
        """

        """
        self.tree = None
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        # self.embedder = EmbedSentence()

    def build(self,
              brand_names: str,
              n_tree: int = 100,
              embedding_size: int = 100):
        """
        #TODO: fill details of param
        :param brand_names:
        :param n_tree:
        :param embedding_size:
        :return:
        """
        brand_names_sentence = sentence_char2vec(brand_names)
        self.tree = AnnoyIndex(embedding_size, 'angular')

        for value, (_, _token) in enumerate(brand_names_sentence.items()):
            self.tree.add_item(value, _token)

        self.tree.build(n_tree)

    def fake_detector(
        self,
        text: str,
        embedding_size: int = 100,
        detection_range: Tuple = (0.97, 0.99)) -> bool:
        """
        #TODO: fill details of params
        :param text:
        :param embedding_size:
        :param detection_range:
        :return:
        """
        found_match = False
        text_sentence = sentence_char2vec(text)
        for _, (_, _token) in enumerate(text_sentence.items()):
            match = self.tree.get_nns_by_vector(_token, 1)[0]
            sim_score = round(
                float(
                    self.cos(
                        _token.view(-1, embedding_size),
                        torch.tensor(self.tree.get_item_vector(match)).view(
                            -1, embedding_size))), 2)

            if detection_range[0] <= sim_score <= detection_range[1]:
                found_match = True

            if found_match:
                break

        return found_match
Example No. 48
# In[27]:

from annoy import AnnoyIndex

# In[28]:

# Choose a random image to experiment with
random_image_index = random.randint(0, num_items - 1)  # randint is inclusive on both ends
# Note: the results may change if the image is changed

# First, we build a search index with two hyperparameters - the number of dimensions of the dataset, and the number of trees.

# In[29]:

annoy_index = AnnoyIndex(
    num_dimensions)  # Length of item vector that will be indexed
for i in range(num_items):
    annoy_index.add_item(i, dataset[i])
annoy_index.build(40)  #40 trees

# Now let’s find out the time it takes to search the 5 nearest neighbors of one image.

# In[30]:

#u = AnnoyIndex(num_dimensions)
#Time the search for one image for Annoy
get_ipython().run_line_magic(
    'timeit',
    'annoy_index.get_nns_by_vector(query, 5, include_distances=True )')

# Now THAT is blazing fast! To put this in perspective, for such a modestly sized dataset, this can serve almost 15000 requests on a single CPU core. Considering most CPUs have multiple cores, it should be able to handle 100K+ requests on a single system. The best part is that it lets you share the same index in memory between multiple processes. Hence, the biggest index can be equivalent to the size of your overall RAM, making it possible to serve multiple requests on a single system.
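A minimal sketch of the memory-mapping point above: several worker processes load() the same saved index file and share its pages instead of each holding a private copy (the file name and sizes here are made up):

import multiprocessing
import random
from annoy import AnnoyIndex

F, N, INDEX_FILE = 64, 10000, 'shared_index.ann'

def build_index():
    t = AnnoyIndex(F, 'angular')
    for i in range(N):
        t.add_item(i, [random.gauss(0, 1) for _ in range(F)])
    t.build(10)
    t.save(INDEX_FILE)

def worker(item_id):
    u = AnnoyIndex(F, 'angular')
    u.load(INDEX_FILE)            # mmaps the file; pages are shared across processes
    return u.get_nns_by_item(item_id, 5)

if __name__ == '__main__':
    build_index()
    with multiprocessing.Pool(4) as pool:
        print(pool.map(worker, [0, 1, 2, 3]))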
Example No. 49
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('i.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
     # Ensure specifying if prefault is allowed does not impact result
     j.save('j.tree', True)
     k = AnnoyIndex(10)
     k.load('j.tree', True)
     x = k.get_item_vector(99)
     self.assertEqual(u, x)
     k.save('k.tree', False)
     l = AnnoyIndex(10)
     l.load('k.tree', False)
     y = l.get_item_vector(99)
     self.assertEqual(u, y)
Example No. 50
            vectorArr.append(vector)
    except Exception as e:
        print(per)
        print(e)

np.save('vectors', vectorArr)
np.save('persons', nameArr)

###########################################
##             ANNOY SIDE                ##
###########################################

file = np.load('vectors.npy')

f = 512
t = AnnoyIndex(
    f, metric="euclidean")  # Length of item vector that will be indexed
for i in range(len(file)):
    v = file[i]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('test.ann')

u = AnnoyIndex(f, metric="euclidean")
u.load('test.ann')  # super fast, will just mmap the file

av = np.load('vectors.npy')

pr = np.load('persons.npy')

arr = u.get_nns_by_vector(av[0], 3, include_distances=True)
Example No. 51
 def test_load_save(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     u = i.get_item_vector(99)
     i.save('x.tree')
     v = i.get_item_vector(99)
     self.assertEqual(u, v)
     j = AnnoyIndex(10)
     j.load('test/test.tree')
     w = i.get_item_vector(99)
     self.assertEqual(u, w)
Example No. 52
class EntityType(object):
    """Convenience wrapper around Annoy.

    More generally a way to collect vectors within the same entity type and
    quickly find similar vectors.

    * Helps deal with non-contiguous ids through an id map.
    * Checks for 0 vectors before returning matches.
    """

    def __init__(self, nfactor, ntrees, metric='angular',
                 entity_type_id=None, entity_type=None):
        """Initialize EntityType."""
        # metadata
        self._nfactor = nfactor
        self._metric = metric
        # object is accessed using this id. e.g. 'user'
        self._entity_type = entity_type
        # data is loaded in using this id. This can be more compact than the
        # entity_type, depending on the data source
        self._entity_type_id = entity_type_id
        self._ntrees = ntrees

        # data
        self._ann_obj = AnnoyIndex(nfactor, metric)
        # maps entity id to internal representation of id
        self._ann_map = {}
        # maps internal representation of id to entity id
        self._ann_map_inv = {}
        self._nitems = 0

    def add_item(self, entity_id, factors):
        """Add item, populating id map."""
        if entity_id in self._ann_map:
            raise ValueError('Duplicate entity: type = {0}, id = {1}'.format(
                self._entity_type, entity_id))
        self._ann_obj.add_item(self._nitems, factors)
        self._ann_map[entity_id] = self._nitems
        self._nitems = self._nitems + 1

    def build(self, verbose=False):
        """Build annoy model, create invert dictionary for future lookups."""
        self._ann_obj.verbose(verbose)
        self._ann_obj.build(self._ntrees)
        # this is only necessary after build, so we'll create it here
        self._ann_map_inv = {v: k for k, v in self._ann_map.items()}

    def get_nns_by_vector(self, vec, n, search_k):
        """Get nearest neighbors from an input vector."""
        nns = self._ann_obj.get_nns_by_vector(vec, n, search_k)
        return [self._ann_map_inv[x] for x in nns]

    def get_item_vector(self, entity_id):
        """Get a vector for an entity."""
        if entity_id in self._ann_map:
            return self._ann_obj.get_item_vector(self._ann_map[entity_id])
        else:
            return []

    def __iter__(self):
        """Iterate over object, return (entity_id, vector) tuples."""
        return (EntityVector(
                    entity_id=entity_id,
                    vector=self.get_item_vector(entity_id)
                ) for entity_id in self._ann_map.keys())

    def get_nfactor(self):
        return self._nfactor

    def load(self, pkl, filepath):
        entity_type = pkl.get_entity_type(self._entity_type_id)
        self.__dict__ = entity_type.__dict__
        # initialize index
        self._ann_obj = AnnoyIndex(pkl.get_nfactor(), entity_type._metric)
        # mmap the file
        self._ann_obj.load(filepath)
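A short usage sketch of the wrapper above; the entity ids are deliberately non-contiguous strings to show the id map at work:

users = EntityType(nfactor=3, ntrees=10, entity_type='user')
users.add_item('user-17',  [0.1, 0.9, 0.0])
users.add_item('user-203', [0.8, 0.1, 0.1])
users.add_item('user-999', [0.2, 0.7, 0.1])
users.build()
print(users.get_nns_by_vector([0.1, 0.8, 0.1], n=2, search_k=-1))   # e.g. ['user-17', 'user-999']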
Example No. 53
from annoy import AnnoyIndex  # needed for the index built below
import numpy as np

words = []          # filled alongside words_to_wvs in the loop below
words_to_wvs = {}
with open("wvs.txt") as rf:
    for line in rf:
        split_line = line.strip().split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            #print(word)
            vec = [float(i) for i in split_line[1].split(" ")]
            words.append(word)
            words_to_wvs[word] = vec
w1 = "pikachu"
w2 = 'kanto'
w3 = "sinnoh"
w4 = "pachirisu"
words.append("vec({}) - vec({}) + vec({}) =".format(w1, w2, w3))
words_to_wvs["vec({}) - vec({}) + vec({}) =".format(w1, w2, w3)] = list(
    np.asarray(words_to_wvs[w1]) - np.asarray(words_to_wvs[w2]) +
    np.asarray(words_to_wvs[w3]))
t = AnnoyIndex(100, 'angular')  # Length of item vector that will be indexed
for ct, i in enumerate(words):
    t.add_item(ct, words_to_wvs[i])
t.build(100)  # 100 trees

wwca = [w1, w2, w3, w4, "vec({}) - vec({}) + vec({}) =".format(w1, w2, w3)]
for word in wwca:
    print("10 NEAREST NEIGHBORS OF {}".format(word))
    nearest_neighbors = t.get_nns_by_item(words.index(word), 10)
    for nn in nearest_neighbors:
        print(words[nn])
Example No. 54
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example No. 55
def main(_):
    parser = argparse.ArgumentParser(description='TransE.')
    parser.add_argument('--data',
                        dest='data_dir',
                        type=str,
                        help="Data folder",
                        default='./data/FB15k/')
    parser.add_argument('--lr',
                        dest='lr',
                        type=float,
                        help="Learning rate",
                        default=1e-2)
    parser.add_argument("--dim",
                        dest='dim',
                        type=int,
                        help="Embedding dimension",
                        default=256)
    parser.add_argument("--batch",
                        dest='batch',
                        type=int,
                        help="Batch size",
                        default=32)
    parser.add_argument("--worker",
                        dest='n_worker',
                        type=int,
                        help="Evaluation worker",
                        default=3)
    parser.add_argument("--generator",
                        dest='n_generator',
                        type=int,
                        help="Data generator",
                        default=10)
    parser.add_argument("--eval_batch",
                        dest="eval_batch",
                        type=int,
                        help="Evaluation batch size",
                        default=32)
    parser.add_argument("--save_dir",
                        dest='save_dir',
                        type=str,
                        help="Model path",
                        default='./transE')
    parser.add_argument("--load_model",
                        dest='load_model',
                        type=str,
                        help="Model file",
                        default="")
    parser.add_argument("--save_per",
                        dest='save_per',
                        type=int,
                        help="Save per x iteration",
                        default=1)
    parser.add_argument("--eval_per",
                        dest='eval_per',
                        type=int,
                        help="Evaluate every x iteration",
                        default=3)
    parser.add_argument("--max_iter",
                        dest='max_iter',
                        type=int,
                        help="Max iteration",
                        default=30)
    parser.add_argument("--summary_dir",
                        dest='summary_dir',
                        type=str,
                        help="summary directory",
                        default='./transE_summary/')
    parser.add_argument("--keep",
                        dest='drop_out',
                        type=float,
                        help="Keep prob (1.0 keep all, 0. drop all)",
                        default=0.5)
    parser.add_argument("--optimizer",
                        dest='optimizer',
                        type=str,
                        help="Optimizer",
                        default='gradient')
    parser.add_argument("--prefix",
                        dest='prefix',
                        type=str,
                        help="model_prefix",
                        default='DEFAULT')
    parser.add_argument("--loss_weight",
                        dest='loss_weight',
                        type=float,
                        help="Weight on parameter loss",
                        default=1e-2)
    parser.add_argument("--neg_weight",
                        dest='neg_weight',
                        type=float,
                        help="Sampling weight on negative examples",
                        default=0.5)
    parser.add_argument("--save_per_batch",
                        dest='save_per_batch',
                        type=int,
                        help='evaluate and save after every x batches',
                        default=1000)
    parser.add_argument(
        "--outfile_prefix",
        dest='outfile_prefix',
        type=str,
        help='The filename of output file is outfile_prefix.txt',
        default='test_output')
    parser.add_argument("--neg_sample",
                        dest='neg_sample',
                        type=int,
                        help='No. of neg. samples per (h,r) or (t,r) pair',
                        default=5)
    parser.add_argument(
        "--fanout_thresh",
        dest='fanout_thresh',
        type=int,
        help='threshold on fanout of entities to be considered',
        default=2)
    parser.add_argument('--annoy_n_trees',
                        dest='annoy_n_trees',
                        type=int,
                        help='builds a forest of n_trees trees',
                        default=10)
    parser.add_argument(
        '--annoy_search_k',
        dest='annoy_search_k',
        type=int,
        help='During the query it will inspect up to search_k nodes',
        default=-1)
    parser.add_argument('--eval_after',
                        dest='eval_after',
                        type=int,
                        help='Evaluate after this many no. of epochs',
                        default=4)

    args = parser.parse_args()

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    print(args)

    model = TransE(args.data_dir,
                   embed_dim=args.dim,
                   fanout_thresh=args.fanout_thresh,
                   eval_batch=args.eval_batch)

    train_pos_neg_list, \
    train_loss, train_op = train_ops(model, learning_rate=args.lr,
                                     optimizer_str=args.optimizer,
                                     regularizer_weight=args.loss_weight)

    get_embedding_op = embedding_ops(model)

    # test_input, test_head, test_tail = test_ops(model)
    f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix), 'w')

    with tf.Session() as session:
        tf.global_variables_initializer().run()

        all_var = tf.all_variables()
        print 'printing all', len(all_var), ' TF variables:'
        for var in all_var:
            print var.name, var.get_shape()

        saver = tf.train.Saver(restore_sequentially=True)

        iter_offset = 0

        if args.load_model is not None and os.path.exists(args.load_model):
            saver.restore(session, args.load_model)
            iter_offset = int(
                args.load_model.split('.')[-2].split('_')[-1]) + 1
            f1.write("Load model from %s, iteration %d restored.\n" %
                     (args.load_model, iter_offset))

        total_inst = model.n_train
        best_filtered_mean_rank = float("inf")

        f1.write("preparing training data...\n")
        nbatches_count = 0
        # training_data_list = []
        training_data_pos_neg_list = []

        for dat in model.raw_training_data(batch_size=args.batch):
            # raw_training_data_queue.put(dat)
            # training_data_list.append(dat)
            ps_list = data_generator_func(dat, model.tr_h, model.hr_t,
                                          model.n_entity, args.neg_sample,
                                          model.n_relation)
            assert ps_list is not None
            training_data_pos_neg_list.append(ps_list)
            nbatches_count += 1
        f1.write("training data prepared.\n")
        f1.write("No. of batches : %d\n" % nbatches_count)
        f1.close()

        start_time = timeit.default_timer()

        for n_iter in range(iter_offset, args.max_iter):
            accu_loss = 0.
            ninst = 0
            # f1.close()

            for batch_id in range(nbatches_count):
                f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix),
                          'a')

                pos_neg_list = training_data_pos_neg_list[batch_id]
                #print data_e
                l, _ = session.run([train_loss, train_op],
                                   {train_pos_neg_list: pos_neg_list})

                accu_loss += l
                ninst += len(pos_neg_list)

                # print('len(pos_neg_list) = %d\n' % len(pos_neg_list))

                if ninst % 5000 < len(pos_neg_list):  # log roughly every 5000 training instances
                    f1.write('[%d sec](%d/%d) : %.2f -- loss : %.5f \n' %
                             (timeit.default_timer() - start_time, ninst,
                              total_inst, float(ninst) / total_inst, l))
                f1.close()

            f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix), 'a')
            f1.write("")
            f1.write("iter %d avg loss %.5f, time %.3f\n" %
                     (n_iter, accu_loss / ninst,
                      timeit.default_timer() - start_time))

            # if n_iter == args.max_iter - 1:
            #     save_path = saver.save(session,
            #                            os.path.join(args.save_dir,
            #                                         "TransE_" + str(args.prefix) + "_" + str(n_iter) + ".ckpt"))
            #     f1.write("Model saved at %s\n" % save_path)

            with tf.device('/cpu'):
                if n_iter > args.eval_after and (n_iter % args.eval_per == 0 or
                                                 n_iter == args.max_iter - 1):

                    t = AnnoyIndex(model.embed_dim, metric='euclidean')

                    ent_embedding, rel_embedding = session.run(
                        get_embedding_op, {train_pos_neg_list: pos_neg_list})
                    # sess = tf.InteractiveSession()
                    # with sess.as_default():
                    #     ent_embedding = model.ent_embeddings.eval()
                    print np.asarray(ent_embedding).shape
                    print np.asarray(rel_embedding).shape

                    # print ent_embedding[10,:]
                    # print rel_embedding[10,:]
                    print 'Index creation started'

                    for i in xrange(model.n_entity):
                        v = ent_embedding[i, :]
                        t.add_item(i, v)
                    t.build(args.annoy_n_trees)

                    print 'Index creation completed'

                    # n = int(0.0005 * model.n_entity)
                    n = 1000
                    # search_k = int(n * args.annoy_n_trees/100.0)
                    search_k = 1000

                    print 'No. of items = %d' % t.get_n_items()
                    print sum(t.get_item_vector(0))
                    print sum(ent_embedding[0, :])
                    assert np.isclose(sum(t.get_item_vector(0)),
                                      sum(ent_embedding[0, :]))

                    if n_iter == args.max_iter - 1:
                        eval_dict = zip(
                            [model.validation_data, model.testing_data],
                            ['VALID', 'TEST'])
                    else:
                        eval_dict = zip([model.validation_data], ['VALID'])

                    for data_func, test_type in eval_dict:
                        accu_mean_rank_h = list()
                        accu_mean_rank_t = list()
                        accu_filtered_mean_rank_h = list()
                        accu_filtered_mean_rank_t = list()

                        evaluation_count = 0
                        evaluation_batch = []
                        batch_id = 0
                        for testing_data in data_func(
                                batch_size=args.eval_batch):
                            batch_id += 1
                            print 'test_type: %s, batch id: %d' % (test_type,
                                                                   batch_id)
                            head_ids = list()
                            tail_ids = list()

                            for i in xrange(testing_data.shape[0]):
                                # try:
                                # print (ent_embedding[testing_data[i,0],:] + rel_embedding[testing_data[i,2],:])
                                tail_ids.append(
                                    t.get_nns_by_vector(
                                        (ent_embedding[testing_data[i, 0], :] +
                                         rel_embedding[testing_data[i, 2], :]),
                                        n, search_k))
                                head_ids.append(
                                    t.get_nns_by_vector(
                                        (ent_embedding[testing_data[i, 1], :] -
                                         rel_embedding[testing_data[i, 2], :]),
                                        n, search_k))
                                # except:
                                #     print 'i = %d' % i
                                #     print 'testing_data[i,0] = %d' % testing_data[i,0]
                                #     print 'testing_data[i,1] = %d' % testing_data[i,1]
                                #     print 'testing_data[i,2] = %d' % testing_data[i,2]

                            # print head_ids
                            # print tail_ids
                            evaluation_batch.append(
                                (testing_data, head_ids, tail_ids))
                            evaluation_count += 1

                        while evaluation_count > 0:
                            evaluation_count -= 1

                            # (mrh, fmrh), (mrt, fmrt) = result_queue.get()
                            (mrh, fmrh), (mrt, fmrt) = worker_func(
                                evaluation_batch[evaluation_count],
                                model.hr_t, model.tr_h)
                            accu_mean_rank_h += mrh
                            accu_mean_rank_t += mrt
                            accu_filtered_mean_rank_h += fmrh
                            accu_filtered_mean_rank_t += fmrt

                        f1.write(
                            "[%s] ITER %d [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f\n"
                            % (test_type, n_iter, np.mean(accu_mean_rank_h),
                               np.mean(accu_filtered_mean_rank_h),
                               np.mean(
                                   np.asarray(accu_mean_rank_h, dtype=np.int32)
                                   < 10),
                               np.mean(
                                   np.asarray(accu_filtered_mean_rank_h,
                                              dtype=np.int32) < 10)))

                        f1.write(
                            "[%s] ITER %d [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f\n"
                            % (test_type, n_iter, np.mean(accu_mean_rank_t),
                               np.mean(accu_filtered_mean_rank_t),
                               np.mean(
                                   np.asarray(accu_mean_rank_t, dtype=np.int32)
                                   < 10),
                               np.mean(
                                   np.asarray(accu_filtered_mean_rank_t,
                                              dtype=np.int32) < 10)))

                        if test_type == 'VALID':
                            filtered_mean_rank = (
                                np.mean(accu_filtered_mean_rank_t) +
                                np.mean(accu_filtered_mean_rank_h)) / 2.0
                            if filtered_mean_rank < best_filtered_mean_rank:
                                save_path = saver.save(
                                    session,
                                    os.path.join(
                                        args.save_dir,
                                        "TransE_" + str(args.prefix) + "_" +
                                        str(n_iter) + ".ckpt"))
                                f1.write("Model saved at %s\n" % save_path)
                                best_filtered_mean_rank = filtered_mean_rank
            f1.close()
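
The evaluation block above uses Annoy for approximate link prediction: it indexes every entity embedding, then ranks tail candidates by querying with h + r and head candidates by querying with t - r. A minimal standalone sketch of that ranking step is shown below; the embedding arrays are random placeholders standing in for the tensors returned by session.run(get_embedding_op), and the sizes, the query triple and the n/search_k values are illustrative assumptions.

import numpy as np
from annoy import AnnoyIndex

# Placeholder embeddings; in the script above these come from the trained TransE model.
n_entity, n_relation, embed_dim = 1000, 20, 50
ent_embedding = np.random.randn(n_entity, embed_dim).astype(np.float32)
rel_embedding = np.random.randn(n_relation, embed_dim).astype(np.float32)

# Index every entity embedding (euclidean metric, as in the script).
t = AnnoyIndex(embed_dim, metric='euclidean')
for i in range(n_entity):
    t.add_item(i, ent_embedding[i, :])
t.build(10)  # annoy_n_trees

# For a test triple (h, true_t, r): tail candidates are the nearest neighbours of h + r,
# head candidates are the nearest neighbours of true_t - r.
h, true_t, r = 0, 1, 2
tail_candidates = t.get_nns_by_vector(ent_embedding[h, :] + rel_embedding[r, :], 100, search_k=1000)
head_candidates = t.get_nns_by_vector(ent_embedding[true_t, :] - rel_embedding[r, :], 100, search_k=1000)

# Rank of the true tail among the retrieved candidates (None if it was not retrieved at all).
tail_rank = tail_candidates.index(true_t) + 1 if true_t in tail_candidates else None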
Exemplo n.º 56
0
    def test_unbuild_with_loaded_tree(self):
        i = AnnoyIndex(10)
        i.load('test/test.tree')
        i.unbuild()
Exemplo n.º 57
0
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from annoy import AnnoyIndex

# img_dir_path = 'dataset/All/'
img_dir_path = 'dataDrivenArt/bin/data/images/'
annoy_model_path = 'model/x-fresh-flatten.ann'
# annoy_dim = 4096 # fc2
annoy_dim = 25088

base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer('flatten').output)

annoy_model = AnnoyIndex(annoy_dim)

for i in range(1, 3988):
    # img_path = img_dir_path + str(i) + '.jpg'
    img_path = "C:\\Users\\santa\\Desktop\\Python\\x-fresh\\dataDrivenArt\\bin\\data\\images\\{0:04d}.jpg".format(
        i)
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    flatten_features = model.predict(x)

    annoy_model.add_item(i, flatten_features[0])
    print(img_path, 'saved')
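
The loop above only adds items to the index; nothing is built or written to disk, even though annoy_model_path is defined earlier. A plausible finishing step, sketched under the assumption that 10 trees are enough for this collection, would be:

# Build the forest and persist it so it can be loaded for querying later.
annoy_model.build(10)  # tree count is an assumption; more trees give better recall but a larger file
annoy_model.save(annoy_model_path)
print(annoy_model_path, 'built with', annoy_model.get_n_items(), 'items')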
Exemplo n.º 58
0
                    definitions[definitions_index]['language'],
                }
                for resource in [
                        data_pipeline.TREE_LABEL, data_pipeline.GRAPH_LABEL,
                        data_pipeline.CODE_TOKENS_LABEL
                ]:
                    sample[resource] = loaded_sample[resource]
                samples.append(sample)
                definitions_index += 1
            data_file_code_representations = model.get_code_representations(
                samples)
            code_representations_all.extend(data_file_code_representations)

        print('len(code_representations_all)', len(code_representations_all))

        indices = AnnoyIndex(code_representations_all[0].shape[0], 'angular')
        for index, vector in tqdm(enumerate(code_representations_all)):
            if vector is not None:
                indices.add_item(index, vector)
        indices.build(200)
        print('Index is built')

        for query in queries:
            for idx, _ in zip(*query_model(query, model, indices, language)):
                predictions.append(
                    (query, language, definitions[idx]['identifier'],
                     definitions[idx]['url']))

    df = pd.DataFrame(predictions,
                      columns=['query', 'language', 'identifier', 'url'])
    df.to_csv(predictions_csv, index=False)
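
query_model is called above but not defined in this fragment. A hypothetical sketch of such a helper is given below; get_query_representations is an assumed counterpart to the model's get_code_representations and may not match the real API.

def query_model(query, model, indices, language, topk=100):
    # Hypothetical: encode the query the same way the code snippets were encoded,
    # then look up its approximate nearest neighbours in the Annoy index.
    query_vector = model.get_query_representations([query])[0]  # assumed API
    idxs, distances = indices.get_nns_by_vector(query_vector, topk,
                                                include_distances=True)
    return idxs, distances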
Exemplo n.º 59
0
    def test_not_found_tree(self):
        i = AnnoyIndex(10)
        self.assertRaises(IOError, i.load, 'nonexists.tree')

def load_annoy_tree(model_file_name, vector_dims):
    tree = AnnoyIndex(vector_dims)
    tree.load(model_file_name)
    return tree
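
A short usage sketch for load_annoy_tree; the file name and dimensionality below are placeholders and must match whatever index was actually saved.

# Load a previously saved index and fetch the 10 nearest neighbours of item 0.
tree = load_annoy_tree('model.ann', 100)
neighbours = tree.get_nns_by_item(0, 10)
print(neighbours)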