Пример #1
0
    def __init__(self, cpu=None, verbose=False):
        """Set up the CLIP model, tokenizer, preprocessing and image index.

        Tokenizer and model files that are missing from
        ``~/.config/hypertag/CLIP-files`` are downloaded first. The hnswlib
        index stored at ``~/.config/hypertag/index-files/images.index`` is
        loaded when it exists; otherwise it is built from the vectors returned
        by ``self.get_image_corpus()`` and saved.

        Args:
            cpu: When truthy, use the CPU even if CUDA is available.
            verbose: When truthy, print progress messages.
        """
        self.verbose = verbose
        use_cuda = torch.cuda.is_available() and not cpu
        self.device = "cuda" if use_cuda else "cpu"
        if verbose:
            print("Using device:", self.device)

        tokenizer_url = "https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz"
        model_urls = {
            "cuda": "https://openaipublic.azureedge.net/clip/models/" +
            "40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
            "cpu": "https://battle.shawwn.com/sdb/models/ViT-B-32-cpu.pt",
        }

        # Local cache directory for the downloaded CLIP files
        clip_dir = Path.home() / ".config/hypertag/" / "CLIP-files"
        os.makedirs(clip_dir, exist_ok=True)

        tokenizer_file = clip_dir / tokenizer_url.rsplit("/", 1)[-1]
        if not tokenizer_file.is_file():
            print("Downloading tokenizer...")
            download_url(tokenizer_url, tokenizer_file)

        model_file = clip_dir / (self.device + "-model.pt")
        if not model_file.is_file():
            print(f"Downloading CLIP {self.device.upper()} model...")
            download_url(model_urls[self.device], model_file)

        self.model = torch.jit.load(str(model_file), map_location=self.device)
        if not use_cuda:
            # Off-GPU the model is converted with .float() before eval()
            self.model = self.model.float()
        self.model = self.model.eval()
        self.tokenizer = SimpleTokenizer(bpe_path=str(tokenizer_file))

        resolution = self.model.input_resolution.item()
        self.preprocess = Compose([
            Resize(resolution, interpolation=Image.BICUBIC),
            CenterCrop(resolution),
            ToTensor(),
        ])

        # Per-channel normalization constants, kept on the active device
        mean_values = [0.48145466, 0.4578275, 0.40821073]
        std_values = [0.26862954, 0.26130258, 0.27577711]
        self.image_mean = torch.tensor(mean_values).to(self.device)
        self.image_std = torch.tensor(std_values).to(self.device)

        # Build or load index
        corpus_vectors, corpus_paths = self.get_image_corpus()
        index_dir = Path.home() / ".config/hypertag/index-files/"
        self.index_path = index_dir / "images.index"
        os.makedirs(index_dir, exist_ok=True)
        self.index = hnswlib.Index(space="cosine", dim=512)

        if self.index_path.exists():
            if self.verbose:
                print("Loading image index...")
            self.index.load_index(str(self.index_path),
                                  max_elements=len(corpus_vectors))
            self.update_index()
        elif not corpus_vectors:
            # Nothing to index yet: leave the index uninitialized
            return
        else:
            if self.verbose:
                print("Creating HNSWLIB image index...")
            vector_count = len(corpus_vectors)
            self.index.init_index(max_elements=vector_count,
                                  ef_construction=400,
                                  M=64)
            # Labels are the positions of the vectors in the corpus
            self.index.add_items(corpus_vectors, list(range(vector_count)))
            if self.verbose:
                print("Saving index to:", self.index_path)
            self.index.save_index(str(self.index_path))
            # Update DB (set files as indexed)
            with Persistor() as db:
                db.set_indexed_by_file_paths(corpus_paths)

        # Recall is controlled through ef (lower is faster but less accurate);
        # ef should always be > top_k_hits
        self.index.set_ef(50)
Пример #2
0
import json
# Build the ANN index once and cache it on disk; later runs only load it.
if not os.path.isfile(ann_index):

    definitions = terms["training"].to_list()

    embeddings = model.encode(definitions,
                              show_progress_bar=False,
                              normalize_embeddings=True)

    p = ann(embeddings)

    p.save_index(ann_index)

# NOTE(review): dim=768 must match the embedding width produced by `model`
# -- confirm against the model in use.
e_idx = hnswlib.Index(space=hnsw_distance, dim=768)
e_idx.load_index(ann_index)

# Example query, kept for reference:
#   query = "transmission electron microscope TEM"
#   embeddings = model.encode(query,
#                             show_progress_bar=True,
#                             normalize_embeddings=True)
#   labels, distances = e_idx.knn_query(embeddings, k=20)

# Parameter table; `prms` holds the distinct values of its "title" column.
params = pd.read_csv(os.path.join(folder_output, "params.txt"),
                     sep="\t",
                     encoding="utf-8")
prms = params["title"].unique()
Пример #3
0
def play_annotated_video():
    """Play the video with every sliding window marked by its observation group.

    Each frame is resized with ``resize_frame``, cut into a
    ``steps[0]`` x ``steps[1]`` grid of windows, and described by the L2Net
    CNN. The nearest stored observation of each window is looked up in the
    hnswlib index, its group is read from Neo4j, and a filled circle in the
    group's colour is drawn at the window centre. The next frame is shown
    after any key press; ESC stops playback.

    Relies on module-level names defined elsewhere in the file:
    ``window_size``, ``stride``, ``steps``, ``colors``, ``resize_frame`` and
    ``get_observation_group``.
    """

    # Video
    cv2.namedWindow("preview")
    vc = cv2.VideoCapture('./media/cows.mp4')

    # CNN
    l2_net = L2Net("L2Net-HP+", True)

    # KNN
    p = hnswlib.Index(space='l2', dim=256)
    p.load_index("./data/index.bin")

    # DB
    uri = "neo4j://localhost:7687"
    driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

    rows, cols = steps[0], steps[1]
    half_window = round(window_size / 2)

    # Maps database group identifiers to small consecutive colour indices
    group_id = 0
    group_id_dict = {}

    try:
        while vc.isOpened():

            rval, frame = vc.read()
            if not rval:
                break

            frame = resize_frame(frame, window_size, stride, steps)

            windows = np.empty((rows * cols, window_size, window_size, 1))

            for x in range(rows):
                for y in range(cols):
                    # Row-major flat index: the row stride is the number of
                    # columns (steps[1]), not the number of rows.
                    w = x * cols + y
                    windows[w] = frame[(stride * x):(stride * x + window_size),
                                       (stride * y):(stride * y + window_size)]

            # extract cnn features from windows
            feats = l2_net.calc_descriptors(windows)

            labels, _ = p.knn_query(feats, k=1)

            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)

            # One session per frame instead of one per window
            with driver.session() as session:
                for x in range(rows):
                    for y in range(cols):
                        label = labels[x * cols + y][0]

                        # get category of observation with label
                        g = session.read_transaction(get_observation_group,
                                                     label)
                        if g not in group_id_dict:
                            group_id_dict[g] = group_id
                            group_id += 1
                        g = group_id_dict[g]

                        print("observation_group", g)

                        # Groups beyond the palette share its last colour
                        c = colors[min(g, len(colors) - 1)]

                        cv2.circle(frame, (stride * y + half_window,
                                           stride * x + half_window), 3, c,
                                   cv2.FILLED)

            cv2.imshow("preview", frame)

            key = cv2.waitKey(0)

            if key == 27:  # exit on ESC
                break
    finally:
        # Release the capture, window and driver even when a frame fails
        vc.release()
        cv2.destroyWindow("preview")
        driver.close()
Пример #4
0
def build_graph():
    """Build the observation graph and the KNN index from runs over the video.

    For each of ``runs`` passes, a window is walked across the video frames.
    Every window is stored as an observation in Neo4j, linked to the
    previous observation and to close neighbours found in the hnswlib
    index, and then added to that index. The index is saved to
    ``./data/index.bin`` at the end.

    Relies on module-level names defined elsewhere in the file, e.g.
    ``runs``, ``max_batches``, ``frame_batch_size``, ``window_size``,
    ``stride``, ``steps``, ``knn`` and ``distance_threshold``.
    """

    print("Starting...")

    # initialize CNN
    l2_net = L2Net("L2Net-HP+", True)

    # initialize KNN index
    p = hnswlib.Index(space='l2', dim=256)
    p.init_index(max_elements=50000, ef_construction=100, M=16)
    p.set_ef(10)

    # initialize graph database
    uri = "neo4j://localhost:7687"
    driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

    total_frame_count = 0

    try:
        # for each run through the video
        for r in range(runs):

            print("Run", r)

            # open video file for a run through
            cap = cv2.VideoCapture('./media/cows.mp4')
            try:
                total_frame_count = _record_run(cap, l2_net, p, driver,
                                                total_frame_count)
            finally:
                cap.release()
                cv2.destroyAllWindows()

        p.save_index("./data/index.bin")
    finally:
        # Close the driver even when a run fails part-way
        driver.close()

    print("Done")


def _record_run(cap, l2_net, p, driver, total_frame_count):
    """Process one pass over ``cap``; return the updated total frame count."""

    # select a random starting position
    pos = (random.randint(0, steps[0] - 1), random.randint(0, steps[1] - 1))

    done = False

    last_label = None
    last_labels = None
    last_distances = None

    run_frame_count = 0

    # for each batch
    for batch in range(max_batches):
        if done:
            break

        print("Batch", batch)

        windows = np.empty((frame_batch_size, window_size, window_size, 1))
        positions = []
        ids = []
        frame_times = []
        batch_frame_count = 0

        # read frames from video and walk window
        for b in range(frame_batch_size):
            ret, frame = cap.read()

            if not ret:
                done = True
                break

            print("pos", pos)

            print("frame.shape", frame.shape)
            frame = resize_frame(frame, window_size, stride, steps)
            print("frame.shape", frame.shape)

            windows[b] = frame[(stride * pos[0]):(stride * pos[0] +
                                                  window_size),
                               (stride * pos[1]):(stride * pos[1] +
                                                  window_size)]

            cv2.imwrite(
                './output/testing' + str(total_frame_count) + '.jpg',
                windows[b])

            positions.append(pos)

            # Index of this frame within the current run; b always equals
            # batch_frame_count here, so this matches the original formula.
            frame_time = run_frame_count
            frame_times.append(frame_time)
            ids.append(window_id(frame_time, pos[0], pos[1]))

            total_frame_count += 1
            batch_frame_count += 1
            run_frame_count += 1

            pos = move(pos)

        # if no frames were read break
        if batch_frame_count == 0:
            break

        # if batch is short resize windows array to match
        if batch_frame_count != frame_batch_size:
            windows = windows[0:batch_frame_count]

        # extract cnn features from windows
        feats = l2_net.calc_descriptors(windows)
        print("feats.shape", feats.shape)

        # One session per batch instead of one per write transaction
        with driver.session() as session:

            for b in range(batch_frame_count):
                y = positions[b][0]
                x = positions[b][1]
                session.write_transaction(insert_observation, ids[b],
                                          frame_times[b], y, x,
                                          feats_to_json(feats[b]))

            if p.get_current_count() >= knn:

                labels, distances = p.knn_query(feats, k=knn)

                for b in range(batch_frame_count):

                    current_label = ids[b]

                    if b == 0:
                        if last_labels is None or last_distances is None:
                            last_label = current_label
                            continue
                        # Neighbours of the final window of the last batch
                        neighbor_labels = last_labels[-1]
                        neighbor_distances = last_distances[-1]
                    else:
                        neighbor_labels = labels[b - 1]
                        neighbor_distances = distances[b - 1]

                    print("--", last_label, current_label)

                    session.write_transaction(insert_adjacency, last_label,
                                              current_label, 0.0)

                    for n in range(knn):
                        label = neighbor_labels[n]
                        distance = neighbor_distances[n]

                        if distance <= distance_threshold:

                            print("distance", distance)

                            session.write_transaction(insert_adjacency, label,
                                                      current_label, distance)

                    last_label = current_label

                last_labels = labels
                last_distances = distances

        p.add_items(feats, ids)

    return total_frame_count
Пример #5
0
import hnswlib
import numpy as np

dim = 128
num_elements = 10000

# Sample data: random float32 vectors, one row per element
data = np.random.random((num_elements, dim)).astype(np.float32)

# Labels taken from the very top of the uint64 range
label_limit = 2 ** 64
ids = np.uint64(list(range(label_limit - num_elements, label_limit)))

# data_labels = np.arange(num_elements)

# Declare the index (possible spaces are l2, cosine or ip)
p = hnswlib.Index(space='l2', dim=dim)

# Initialize the index - the maximum number of elements must be known up front
p.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Insert the elements (can be called several times)
p.add_items(data, ids)

# Recall is controlled through ef; ef should always be > k
p.set_ef(50)

# Query the dataset for the single closest element of every vector
# (returns 2 numpy arrays)
labels, distances = p.knn_query(data, k=1)

print(labels, distances)
Пример #6
0
import numpy as np
import hnswlib

from fastapi import APIRouter
from pydantic import BaseModel
from typing import List

router = APIRouter()

# Seed data: 1000 random 512-dimensional vectors drawn from N(0, 1)
data = np.random.normal(loc=0.0, scale=1.0, size=(1000, 512))

item_count = data.shape[0]
data_labels = np.arange(item_count)

# Cosine-space index with capacity for twice the seed data
graph = hnswlib.Index(space='cosine', dim=512)
graph.init_index(max_elements=2 * item_count, ef_construction=200, M=16)
graph.add_items(data, data_labels)
graph.set_ef(50)


class QueryResponse(BaseModel):
    """Response schema with a timing value plus labels and distances.

    NOTE(review): presumably returned by a nearest-neighbour query endpoint
    that is not visible here -- confirm against the route handlers.
    """
    # Timing value; NOTE(review): unit is not visible here -- confirm
    time_passed: float
    # Integer labels; presumably the labels of the matched index items
    labels: List[int]
    # Float distances; presumably parallel to `labels` -- confirm
    distances: List[float]


class AddResponse(BaseModel):
    """Response schema with a timing value and an item count.

    NOTE(review): presumably returned by an endpoint that adds items to the
    index, which is not visible here -- confirm against the route handlers.
    """
    # Timing value; NOTE(review): unit is not visible here -- confirm
    time_passed: float
    # Integer item count; presumably the index size after the add -- confirm
    item_count: int
Пример #7
0
    def testRandomSelf(self):
        """Check save/load, label bookkeeping and mark-delete on random data.

        For 16 seeds: index the first half of the data, save and reload the
        index, add the second half, verify recall and labels, mark the first
        half as deleted, and verify the deleted elements are no longer
        returned -- also after another save/load round trip.
        """
        for idx in range(16):
            print("\n**** Index save-load test ****\n")

            np.random.seed(idx)
            dim = 16
            num_elements = 10000

            # Generating sample data
            data = np.float32(np.random.random((num_elements, dim)))

            # Declaring index
            p = hnswlib.Index(space='l2',
                              dim=dim)  # possible options are l2, cosine or ip

            # Initing index
            # max_elements - the maximum number of elements, should be known beforehand
            #     (probably will be made optional in the future)
            #
            # ef_construction - controls index search speed/build speed tradeoff
            # M - is tightly connected with internal dimensionality of the data
            #     strongly affects the memory consumption

            p.init_index(max_elements=num_elements, ef_construction=100, M=16)

            # Controlling the recall by setting ef:
            # higher ef leads to better accuracy, but slower search
            p.set_ef(100)

            p.set_num_threads(4)  # by default using all available cores

            # We split the data in two batches:
            data1 = data[:num_elements // 2]
            data2 = data[num_elements // 2:]

            print("Adding first batch of %d elements" % (len(data1)))
            p.add_items(data1)

            # Query the elements for themselves and measure recall:
            labels, distances = p.knn_query(data1, k=1)

            items = p.get_items(labels)

            # Check the recall:
            self.assertAlmostEqual(
                np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.mean(np.abs(data1 - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            # Serializing and deleting the index.
            # We need the part to check that serialization is working properly.

            index_path = 'first_half.bin'
            print("Saving index to '%s'" % index_path)
            p.save_index(index_path)
            print("Saved. Deleting...")
            del p
            print("Deleted")

            print("\n**** Mark delete test ****\n")
            # Reiniting, loading the index
            print("Reiniting")
            p = hnswlib.Index(space='l2', dim=dim)

            print("\nLoading index from '%s'\n" % index_path)
            p.load_index(index_path)
            p.set_ef(100)

            print("Adding the second batch of %d elements" % (len(data2)))
            p.add_items(data2)

            # Query the elements for themselves and measure recall:
            labels, distances = p.knn_query(data, k=1)
            items = p.get_items(labels)

            # Check the recall:
            self.assertAlmostEqual(
                np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.mean(np.abs(data - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            # Checking that all labels are returned correctly.
            # `~a == b` parses as `(~a) == b`, which never matches and made
            # this check pass vacuously; count real mismatches instead.
            sorted_labels = sorted(p.get_ids_list())
            self.assertEqual(
                np.sum(np.asarray(sorted_labels) != np.arange(num_elements)),
                0)

            # Delete data1
            labels1, _ = p.knn_query(data1, k=1)

            for row in labels1:
                p.mark_deleted(row[0])
            labels2, _ = p.knn_query(data2, k=1)
            items = p.get_items(labels2)
            diff_with_gt_labels = np.mean(np.abs(data2 - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3)

            # No deleted label may be returned any more. A set intersection
            # replaces the former O(n^2) pairwise Python loop.
            labels1_after, _ = p.knn_query(data1, k=1)
            still_returned = np.intersect1d(labels1_after[:, 0],
                                            labels1[:, 0])
            self.assertEqual(still_returned.size, 0)
            print("All the data in data1 are removed")

            # checking saving/loading index with elements marked as deleted
            del_index_path = "with_deleted.bin"
            p.save_index(del_index_path)
            p = hnswlib.Index(space='l2', dim=dim)
            p.load_index(del_index_path)
            p.set_ef(100)

            labels1_after, _ = p.knn_query(data1, k=1)
            still_returned = np.intersect1d(labels1_after[:, 0],
                                            labels1[:, 0])
            self.assertEqual(still_returned.size, 0)

        os.remove(index_path)
        os.remove(del_index_path)
Пример #8
0
    def dedupe(self, args):
        """Group near-duplicate images and optionally write a duplicate log.

        The neighbour search backend is NGT by default, hnswlib when
        ``self.hnsw`` is set, or a flat faiss index when ``self.faiss_flat``
        is set. With ``args.query`` only the neighbours of that one image are
        collected; otherwise every cached image is grouped with its
        neighbours whose distance is at most ``self.hamming_distance``.
        Groups are stored in ``self.group`` and the number of groups in
        ``self.num_duplicate_set``.

        Args:
            args: Parsed command-line options (``num_proc``, ``query``,
                ``log``, ``sameline`` and the backend-specific settings).
        """
        if not self.load_hashcache():
            self.dump_hashcache()

        # check num_proc
        if args.num_proc is None:
            num_proc = max(cpu_count() - 1, 1)
        else:
            num_proc = args.num_proc

        # Use NGT by default
        if (not self.hnsw) and (not self.faiss_flat):
            current_group_num = self._dedupe_with_ngt(args, num_proc)
        elif self.hnsw:
            current_group_num = self._dedupe_with_hnsw(args, num_proc)
        else:
            current_group_num = self._dedupe_with_faiss(args, num_proc)

        # sort self.group
        if self.sort != 'none':
            self.sort_group()

        # write duplicate log file
        self.num_duplicate_set = current_group_num - 1
        if self.num_duplicate_set > 0 and args.log:
            now = datetime.now().strftime('%Y%m%d%H%M%S')
            duplicate_log_file = "{}_{}".format(now,
                                                self.get_duplicate_log_name())
            # In query mode the query image itself counts as a group member
            pad = 1 if args.query else 0
            with open(duplicate_log_file, 'w') as f:
                if args.query:
                    f.write("Query: {}\n\n".format(args.query))
                for k in range(1, self.num_duplicate_set + 1):
                    img_list = self.group[k]
                    if len(img_list) + pad > 1:
                        sorted_img_list, _, _, _ = self.sort_image_list(
                            img_list)
                        if args.sameline:
                            f.write(" ".join(sorted_img_list) + "\n")
                        else:
                            f.write("\n".join(sorted_img_list) + "\n")
                            if k != len(self.group):
                                f.write("\n")

    def _group_all_images(self, num_items, filenames, neighbors_of):
        """Group every image with its close neighbours; return the next group number.

        ``neighbors_of(i)`` must return an iterable of ``(label, distance)``
        pairs for image ``i``.
        """
        check_list = [0] * num_items
        current_group_num = 1
        for i in tqdm(range(num_items)):
            if check_list[i] != 0:
                # already grouped image
                continue
            new_group_found = False
            for label, distance in neighbors_of(i):
                # The distance test comes before any indexing by label so
                # that out-of-range placeholder labels are never used.
                if label == i or distance > self.hamming_distance:
                    continue
                if check_list[label] != 0:
                    continue
                if check_list[i] == 0:
                    # new group
                    new_group_found = True
                    check_list[i] = current_group_num
                    check_list[label] = current_group_num
                    self.group[current_group_num] = [
                        filenames[i], filenames[label]
                    ]
                else:
                    # exists group
                    exists_group_num = check_list[i]
                    check_list[label] = exists_group_num
                    self.group[exists_group_num].append(filenames[label])
            if new_group_found:
                current_group_num += 1
        return current_group_num

    def _group_query_matches(self, filenames, matches):
        """Collect the close matches of a query image into group 1; return the next group number."""
        current_group_num = 1
        self.group[current_group_num] = []
        new_group_found = False
        for label, distance in matches:
            if distance <= self.hamming_distance:
                new_group_found = True
                self.group[current_group_num].append(filenames[label])
        if new_group_found:
            current_group_num += 1
        return current_group_num

    def _dedupe_with_ngt(self, args, num_proc):
        """Run the neighbour search with an on-disk NGT index; return the next group number."""
        try:
            import ngtpy
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load NGT. Please install NGT and python binding first.",
                    'red'))
            sys.exit(1)
        import shutil

        index_path = self.get_ngt_index_path()
        logger.warning(
            "Building NGT index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        ngtpy.create(path=index_path.encode(),
                     dimension=self.hash_bits,
                     edge_size_for_creation=args.ngt_edges,
                     edge_size_for_search=args.ngt_edges_for_search,
                     object_type="Byte",
                     distance_type="Hamming")
        try:
            ngt_index = ngtpy.Index(index_path.encode())
            ngt_index.batch_insert(self.hashcache.hshs(), num_proc)

            # NGT Approximate neighbor search
            logger.warning("Approximate neighbor searching using NGT")
            hshs = self.hashcache.hshs()
            filenames = self.hashcache.filenames()

            def search(hsh):
                results = ngt_index.search(hsh,
                                           size=args.ngt_k,
                                           epsilon=args.ngt_epsilon)
                return [(res[0], res[1]) for res in results]

            if not args.query:
                return self._group_all_images(len(hshs), filenames,
                                              lambda i: search(hshs[i]))
            # query image
            hsh = self.hashcache.gen_hash(args.query)
            return self._group_query_matches(filenames, search(hsh))
        finally:
            # remove ngt index, without going through a shell
            if index_path:
                shutil.rmtree(index_path, ignore_errors=True)

    def _dedupe_with_hnsw(self, args, num_proc):
        """Run the neighbour search with an in-memory hnswlib index; return the next group number."""
        try:
            import hnswlib
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load hnsw. Please install hnsw python binding first.",
                    'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        filenames = self.hashcache.filenames()
        num_elements = len(hshs)
        hshs_labels = np.arange(num_elements)
        # knn_query fails when asked for more neighbours than stored items
        k = min(args.hnsw_k, num_elements)
        hnsw_index = hnswlib.Index(space='l2',
                                   dim=self.hash_bits)  # Squared L2
        hnsw_index.init_index(max_elements=num_elements,
                              ef_construction=args.hnsw_ef_construction,
                              M=args.hnsw_m)
        hnsw_index.set_ef(max(args.hnsw_ef, k))  # ef must not be below k
        hnsw_index.set_num_threads(num_proc)
        logger.warning(
            "Building hnsw index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        hnsw_index.add_items(hshs, hshs_labels, num_proc)

        # hnsw Approximate neighbor search
        logger.warning("Approximate neighbor searching using hnsw")

        def search(hsh):
            labels, distances = hnsw_index.knn_query(hsh,
                                                     k=k,
                                                     num_threads=num_proc)
            return zip(labels[0], distances[0])

        if not args.query:
            return self._group_all_images(num_elements, filenames,
                                          lambda i: search(hshs[i]))
        # query image
        hsh = self.hashcache.gen_hash(args.query)
        return self._group_query_matches(filenames, search(hsh))

    def _dedupe_with_faiss(self, args, num_proc):
        """Run an exact neighbour search with a flat faiss index; return the next group number."""
        try:
            import faiss
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load faiss. Please install faiss python binding first.",
                    'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        filenames = self.hashcache.filenames()
        faiss.omp_set_num_threads(num_proc)
        logger.warning(
            "Building faiss index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        data = np.array(hshs).astype('float32')
        faiss_flat_index = faiss.IndexFlatL2(self.hash_bits)  # Exact search
        faiss_flat_index.add(data)

        # faiss Exact neighbor search
        logger.warning("Exact neighbor searching using faiss")

        def search(vectors):
            distances, labels = faiss_flat_index.search(
                vectors, args.faiss_flat_k)
            return zip(labels[0], distances[0])

        if not args.query:
            return self._group_all_images(faiss_flat_index.ntotal, filenames,
                                          lambda i: search(data[[i]]))
        # query image
        hsh = np.array([self.hashcache.gen_hash(args.query)
                        ]).astype('float32')
        return self._group_query_matches(filenames, search(hsh))
Пример #9
0
import hnswlib
import numpy as np

dim = 2000
num_elements = 1000

session = 'piano'

# One MFCC feature vector per row
data = np.loadtxt(f"../mfcc_extract/{session}_mfcc.txt", dtype=float)
data_labels = np.arange(len(data))
#data_titles = [ t.strip()[:-8] if t.strip()[-8:] == '_320kbps' else t.strip() for t in open("../mfcc_extract/titles.txt", "r")]

p = hnswlib.Index(space='cosine', dim=dim)
# Reserve at least num_elements slots, and more when the file holds more
# rows: add_items fails once the index capacity is exceeded.
p.init_index(max_elements=max(num_elements, len(data)),
             ef_construction=200,
             M=16)
p.add_items(data, data_labels)
p.set_ef(50)  # ef should always be > k
print(len(data))
# labels, distances = p.knn_query(data[:10], k=5)
# for i, nn in enumerate(labels):
#     print(f"{data_titles[i]} : ")
#     for n in nn: print(data_titles[n])

index_path = f'{session}_hnsw.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
del p
Пример #10
0
def load_index(index_path, dim):
    """Load a previously saved hnswlib index from disk.

    A cosine-space index of dimensionality ``dim`` is created, a message
    naming ``index_path`` is printed, the index is read from that path and
    its ``ef`` query parameter is set to 50.

    Args:
        index_path: Location of the saved index file.
        dim: Dimensionality passed to ``hnswlib.Index``.

    Returns:
        The loaded index object.
    """
    index = hnswlib.Index(dim=dim, space='cosine')
    message = "\nLoading index from '%s'" % index_path
    print(message)
    index.load_index(index_path)
    index.set_ef(50)
    return index
Пример #11
0
    model_name = 'distilbert-multilingual-nli-stsb-quora-ranking'
    model = LanguageTransformer(model_name)

    url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
    dataset_path = "quora_duplicate_questions.tsv"
    max_corpus_size = 100000

    embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
        model_name.replace('/', '_'), max_corpus_size)

    embedding_size = 768  #Size of embeddings
    top_k_hits = 10  #Output k hits

    #Defining our hnswlib index
    #We use the 'cosine' space for the index. (For vectors normalized to unit length, Inner Product is equal to cosine similarity.)
    index = hnswlib.Index(space='cosine', dim=embedding_size)

    #Check if embedding cache path exists
    if not os.path.exists(embedding_cache_path):
        # Check if the dataset exists. If not, download and extract
        # Download dataset if needed
        if not os.path.exists(dataset_path):
            print("Download dataset")
            util.http_get(url, dataset_path)

        # Get all unique sentences from the file
        corpus_sentences = set()
        with open(dataset_path, encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_MINIMAL)
Пример #12
0
    def testRandomSelf(self):
        """Check that an index keeps returning correct results after a resize.

        For each of 16 random seeds the test:
          1. creates an L2 index with room for half of the sample data,
          2. adds the first half, queries it against itself and checks the
             recall and the stored vectors,
          3. resizes the index to the full size, adds the second half and
             repeats the recall / stored-vector checks on all data,
          4. verifies that the labels held by the index are exactly
             ``0 .. num_elements - 1``.
        """
        for idx in range(16):
            print("\n**** Index resize test ****\n")

            np.random.seed(idx)
            dim = 16
            num_elements = 10000

            # Generating sample data
            data = np.float32(np.random.random((num_elements, dim)))

            # Declaring index
            p = hnswlib.Index(space='l2',
                              dim=dim)  # possible options are l2, cosine or ip

            # Initiating index
            # max_elements - the maximum number of elements, should be known beforehand
            #     (probably will be made optional in the future)
            #
            # ef_construction - controls index search speed/build speed tradeoff
            # M - is tightly connected with internal dimensionality of the data
            #     strongly affects the memory consumption

            # Only half of the final capacity is reserved so that the resize
            # below is actually required before the second batch fits.
            p.init_index(max_elements=num_elements // 2,
                         ef_construction=100,
                         M=16)

            # Controlling the recall by setting ef:
            # higher ef leads to better accuracy, but slower search
            p.set_ef(20)

            # The thread count varies with the seed (idx % 8).
            # NOTE(review): this is 0 for idx 0 and 8 -- confirm hnswlib
            # accepts a thread count of 0.
            p.set_num_threads(idx % 8)  # by default using all available cores

            # We split the data in two batches:
            data1 = data[:num_elements // 2]
            data2 = data[num_elements // 2:]

            print("Adding first batch of %d elements" % (len(data1)))
            p.add_items(data1)

            # Query the elements for themselves and measure recall:
            labels, _ = p.knn_query(data1, k=1)

            items = p.get_items(list(range(len(data1))))

            # Check the recall:
            self.assertAlmostEqual(
                np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.max(np.abs(data1 - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            print("Resizing the index")
            p.resize_index(num_elements)

            print("Adding the second batch of %d elements" % (len(data2)))
            p.add_items(data2)

            # Query the elements for themselves and measure recall:
            labels, _ = p.knn_query(data, k=1)
            items = p.get_items(list(range(num_elements)))

            # Check the recall:
            self.assertAlmostEqual(
                np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.max(np.abs(data - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            # Checking that all labels are returned correctly.
            # The previous form `~labels == expected` applied the bitwise NOT
            # to the labels before comparing (unary `~` binds tighter than
            # `==`), so the mismatch count was always 0 and the check could
            # never fail. Compare the label arrays directly instead.
            sorted_labels = sorted(p.get_ids_list())
            self.assertTrue(
                np.array_equal(np.asarray(sorted_labels),
                               np.arange(num_elements)))
    scene_dirs = [
        path.join(collection_dir, d, "image") for d in listdir(collection_dir)
        if path.isdir(path.join(collection_dir, d))
    ]
    image_paths = [[path.join(d, e) for e in listdir(d)][0]
                   for d in scene_dirs]
    return [(p[len(sun_rgbd_directory) + 1:], p) for p in image_paths]


# Gather the image entries of both "kv2" sub-collections, keeping the
# "kinect2data" entries ahead of the "align_kv2" ones.
images = [
    entry
    for subset in ("kinect2data", "align_kv2")
    for entry in list_image_paths(
        path.join(sun_rgbd_directory, "kv2", subset))
]

# Database object that extracted patches are inserted into.
database = PatchGraphDatabase()

# L2 nearest-neighbour index over descriptors, created with a capacity of
# 7,000,000 elements.
desc_index = hnswlib.Index(dim=descriptor_size, space='l2')
desc_index.init_index(M=16, ef_construction=200, max_elements=7000000)
desc_index.set_ef(50)


def extract_features_to_db(image_path, image_name, scene_node, size):
    patches = extract_image_features(image_path, image_name, size)

    insert_patches_result = database.insert_patches(patches)
    print("inserted patch nodes", len(insert_patches_result))

    data = np.array([p['des'] for p in insert_patches_result],
                    dtype=np.float32)
    data_labels = np.array([p['id'] for p in insert_patches_result])

    desc_index.add_items(data, data_labels)