def __init__(self, cpu=None, verbose=False):
    """Load the CLIP model and tokenizer and load or build the HNSW image index.

    Args:
        cpu: When truthy, use the CPU even if CUDA is available.
        verbose: When truthy, print progress messages.

    Side effects visible in this method: creates directories under
    ``~/.config/hypertag/``, downloads the tokenizer and model files when
    they are missing, and writes the index file when a new index is built.
    """
    self.verbose = verbose
    self.device = "cuda" if torch.cuda.is_available() and not cpu else "cpu"
    if verbose:
        print("Using device:", self.device)

    TOKENIZER_URL = "https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz"
    # One model file per device; the key is the value of self.device.
    MODEL_URLS = {
        "cuda":
        "https://openaipublic.azureedge.net/clip/models/" +
        "40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
        "cpu": "https://battle.shawwn.com/sdb/models/ViT-B-32-cpu.pt",
    }

    # Local cache directory for the downloaded CLIP files.
    db_path = Path.home() / ".config/hypertag/"
    clip_files_path = db_path / "CLIP-files"
    os.makedirs(clip_files_path, exist_ok=True)

    # The tokenizer keeps the file name it has in the URL.
    tokenizer_name = TOKENIZER_URL.split("/")[-1]
    if not Path(clip_files_path / tokenizer_name).is_file():
        print("Downloading tokenizer...")
        download_url(TOKENIZER_URL, clip_files_path / tokenizer_name)

    # The model is cached under a device-specific name ("cuda-model.pt" / "cpu-model.pt").
    model_name = self.device + "-model.pt"
    if not Path(clip_files_path / model_name).is_file():
        print(f"Downloading CLIP {self.device.upper()} model...")
        download_url(MODEL_URLS[self.device], clip_files_path / model_name)

    self.model = torch.jit.load(str(clip_files_path / model_name),
                                map_location=self.device)
    # On the CPU path the model is additionally passed through .float() before eval().
    self.model = (self.model.eval() if torch.cuda.is_available() and not cpu
                  else self.model.float().eval())
    self.tokenizer = SimpleTokenizer(bpe_path=str(clip_files_path / tokenizer_name))

    # Image preprocessing is sized from the resolution stored on the loaded model.
    input_resolution = self.model.input_resolution.item()
    self.preprocess = Compose([
        Resize(input_resolution, interpolation=Image.BICUBIC),
        CenterCrop(input_resolution),
        ToTensor(),
    ])
    # Per-channel mean/std kept on the target device
    # (presumably used for normalisation elsewhere -- not shown here).
    self.image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).to(self.device)
    self.image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).to(self.device)

    # Build or load index
    corpus_vectors, corpus_paths = self.get_image_corpus()
    index_dir = Path.home() / ".config/hypertag/index-files/"
    self.index_path = index_dir / "images.index"
    os.makedirs(index_dir, exist_ok=True)
    self.index = hnswlib.Index(space="cosine", dim=512)
    if self.index_path.exists():
        if self.verbose:
            print("Loading image index...")
        # NOTE(review): max_elements is taken from the current corpus size, not
        # from the saved index -- confirm this is large enough for what is stored.
        self.index.load_index(str(self.index_path),
                              max_elements=len(corpus_vectors))
        self.update_index()
    else:
        # Create the HNSWLIB index
        if not corpus_vectors:
            # Nothing to index: return with self.index created but not
            # initialised, and without calling set_ef below.
            return
        if self.verbose:
            print("Creating HNSWLIB image index...")
        self.index.init_index(max_elements=len(corpus_vectors),
                              ef_construction=400,
                              M=64)
        # Add every corpus vector, labelled by its position in corpus_vectors.
        self.index.add_items(corpus_vectors, list(range(len(corpus_vectors))))
        if self.verbose:
            print("Saving index to:", self.index_path)
        self.index.save_index(str(self.index_path))
        # Update DB (set files as indexed)
        with Persistor() as db:
            db.set_indexed_by_file_paths(corpus_paths)
    # Controlling the recall by setting ef (lower is faster but less accurate)
    self.index.set_ef(50)  # ef should always be > top_k_hits
import json

# Build the ANN index once and cache it on disk; later runs skip straight to
# loading it.
if not os.path.isfile(ann_index):
    definitions = terms["training"].to_list()
    embeddings = model.encode(definitions,
                              show_progress_bar=False,
                              normalize_embeddings=True)
    p = ann(embeddings)
    p.save_index(ann_index)

# Always query through a freshly loaded index so the cached and the
# just-built case follow the same path.
e_idx = hnswlib.Index(space=hnsw_distance, dim=768)
e_idx.load_index(ann_index)

# Example query, kept for reference:
#query="transmission electron microscope TEM"
#embeddings = model.encode(query,
#                          show_progress_bar=True,
#                          normalize_embeddings=True)
##labels,distances = e_idx.knn_query(embeddings, k=20)
#for label, distance in zip(labels[0],distances[0]):
#    print(distance,tmp.iloc[label]["Class ID"],"\t",tmp.iloc[label]["Preferred Label"],"\t",tmp.iloc[label]["Definitions"])

# Tab-separated parameter table; prms holds the distinct values of its "title" column.
params = pd.read_csv(os.path.join(folder_output, "params.txt"),
                     sep="\t",
                     encoding="utf-8")
prms = params["title"].unique()
def play_annotated_video():
    """Play the video frame by frame, marking each window with its group colour.

    Every frame is resized, cut into a ``steps[0]`` x ``steps[1]`` grid of
    windows, described by the CNN, matched to its nearest stored observation
    in the KNN index, and the observation's group (read from the graph
    database) selects the colour of a dot drawn at the window centre.
    Playback waits for a key after each frame; ESC stops it.

    The capture, the preview window and the database driver are released
    even if an error interrupts playback.
    """
    # Video
    cv2.namedWindow("preview")
    vc = cv2.VideoCapture('./media/cows.mp4')

    # CNN
    l2_net = L2Net("L2Net-HP+", True)

    # KNN
    p = hnswlib.Index(space='l2', dim=256)
    p.load_index("./data/index.bin")

    # DB
    uri = "neo4j://localhost:7687"
    driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

    rows, cols = steps[0], steps[1]
    half_window = round(window_size / 2)

    # Maps the database's group value to a small consecutive id used to pick a colour.
    group_id = 0
    group_id_dict = {}

    try:
        while vc.isOpened():
            rval, frame = vc.read()
            if not rval:
                break

            frame = resize_frame(frame, window_size, stride, steps)

            windows = np.empty((rows * cols, window_size, window_size, 1))
            for x in range(rows):
                for y in range(cols):
                    # Row-major flat index: the row stride is the number of
                    # columns (steps[1]); using steps[0] is only correct for
                    # square grids.
                    windows[x * cols + y] = frame[
                        (stride * x):(stride * x + window_size),
                        (stride * y):(stride * y + window_size)]

            # extract cnn features from windows
            feats = l2_net.calc_descriptors(windows)
            labels, _ = p.knn_query(feats, k=1)

            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)

            for x in range(rows):
                for y in range(cols):
                    label = labels[x * cols + y][0]

                    # get category of observation with label
                    with driver.session() as session:
                        g = session.read_transaction(get_observation_group,
                                                     label)

                    if g not in group_id_dict:
                        group_id_dict[g] = group_id
                        group_id += 1
                    g = group_id_dict[g]
                    print("observation_group", g)

                    # Groups beyond the palette share its last colour.
                    c = colors[min(g, len(colors) - 1)]
                    cv2.circle(frame,
                               (stride * y + half_window,
                                stride * x + half_window), 3, c, cv2.FILLED)

            cv2.imshow("preview", frame)
            key = cv2.waitKey(0)
            if key == 27:  # exit on ESC
                break
    finally:
        vc.release()
        cv2.destroyWindow("preview")
        driver.close()
def build_graph():
    """Walk a window over the video several times and record what it sees.

    For each of ``runs`` passes over the video, a window starting at a random
    grid position is moved frame by frame. Frames are processed in batches of
    ``frame_batch_size``: every window is described by the CNN, stored as an
    observation in the graph database, linked to the previous observation,
    and linked to the previous observation's nearest neighbours from the KNN
    index when they are within ``distance_threshold``. The batch's
    descriptors are then added to the KNN index, which is saved at the end.
    """
    print("Starting...")

    # initialize CNN
    l2_net = L2Net("L2Net-HP+", True)

    # initialize KNN index
    p = hnswlib.Index(space='l2', dim=256)
    p.init_index(max_elements=50000, ef_construction=100, M=16)
    p.set_ef(10)

    # initialize graph database
    uri = "neo4j://localhost:7687"
    driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

    # Counts frames across all runs; only used to name the debug images.
    total_frame_count = 0

    # for each run through the video
    for r in range(runs):

        print("Run", r)

        # open video file for a run through
        cap = cv2.VideoCapture('./media/cows.mp4')

        # select a random starting position
        pos = (random.randint(0, steps[0] - 1),
               random.randint(0, steps[1] - 1))

        done = False
        # State carried from one batch to the next within this run.
        last_label = None
        last_labels = None
        last_distances = None
        run_frame_count = 0

        # for each batch
        for t in range(max_batches):
            if done:
                break

            print("Batch", t)

            windows = np.empty((frame_batch_size, window_size, window_size, 1))
            positions = []
            ids = []

            batch_frame_count = 0

            # read frames from video and walk window
            for b in range(frame_batch_size):

                ret, frame = cap.read()

                if ret == False:
                    done = True
                    break

                print("pos", pos)
                print("frame.shape", frame.shape)
                frame = resize_frame(frame, window_size, stride, steps)
                print("frame.shape", frame.shape)

                windows[b] = frame[(stride * pos[0]):(stride * pos[0] + window_size),
                                   (stride * pos[1]):(stride * pos[1] + window_size)]

                # Debug output: one image per processed window.
                cv2.imwrite(
                    './output/testing' + str(total_frame_count) + '.jpg',
                    windows[b])

                positions.append(pos)
                # Frame index within the run. This rebinds the batch loop
                # variable t, which is harmless because range() supplies a
                # fresh value on the next iteration. Both counters have grown
                # by b at this point, so their difference is the run frame
                # count at the start of this batch.
                t = run_frame_count - batch_frame_count + b
                ids.append(window_id(t, pos[0], pos[1]))

                total_frame_count += 1
                batch_frame_count += 1
                run_frame_count += 1
                pos = move(pos)

            # if no frames were read break
            if batch_frame_count == 0:
                break

            # if batch is short resize windows array to match
            if batch_frame_count != frame_batch_size:
                windows = windows[0:batch_frame_count]

            # extract cnn features from windows
            feats = l2_net.calc_descriptors(windows)

            print("feats.shape", feats.shape)

            # Store one observation per window of the batch.
            for b in range(batch_frame_count):
                id = ids[b]
                # Same frame index as computed while reading the batch.
                t = run_frame_count - batch_frame_count + b
                y = positions[b][0]
                x = positions[b][1]
                # print(t,y,x,id)
                with driver.session() as session:
                    session.write_transaction(insert_observation, id, t, y, x,
                                              feats_to_json(feats[b]))

            # Neighbour queries are only made once the index holds at least knn items.
            if p.get_current_count() >= knn:

                labels, distances = p.knn_query(feats, k=knn)

                for b in range(batch_frame_count):

                    current_label = ids[b]

                    if b == 0:
                        # First window of the batch: its predecessor is the
                        # last window of the previous batch, if there was one.
                        if last_labels is None or last_distances is None:
                            last_label = current_label
                            continue

                        l = last_labels[last_labels.shape[0] - 1]
                        d = last_distances[last_labels.shape[0] - 1]
                    else:
                        # Neighbours of the previous window in this batch.
                        l = labels[b - 1]
                        d = distances[b - 1]

                    print("--", last_label, current_label)
                    # Link the previous observation to the current one with distance 0.0.
                    with driver.session() as session:
                        session.write_transaction(insert_adjacency, last_label,
                                                  current_label, 0.0)

                    # Link the previous observation's close neighbours to the current one.
                    for n in range(knn):
                        label = l[n]
                        distance = d[n]

                        if distance <= distance_threshold:
                            print("distance", distance)
                            with driver.session() as session:
                                session.write_transaction(
                                    insert_adjacency, label, current_label,
                                    distance)

                    last_label = current_label

                last_labels = labels
                last_distances = distances

            # Added only after querying, so a batch never matches its own windows.
            p.add_items(feats, ids)

        cap.release()
        cv2.destroyAllWindows()

    p.save_index("./data/index.bin")
    driver.close()
    print("Done")
import hnswlib
import numpy as np

dim = 128
num_elements = 10000

# Random float32 vectors to index.
data = np.random.random((num_elements, dim)).astype(np.float32)

# Labels are the last `num_elements` values representable in 64 unsigned bits,
# i.e. the range [2**64 - num_elements, 2**64).
ids = np.array(
    list(range(18446744073709551616 - num_elements, 18446744073709551616)),
    dtype=np.uint64)
# data_labels = np.arange(num_elements)

# Create the index; the space may be 'l2', 'cosine' or 'ip'.
p = hnswlib.Index(space='l2', dim=dim)

# The capacity has to be fixed when the index is initialised.
p.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Insert all vectors with their labels (add_items may be called repeatedly).
p.add_items(data, ids)

# ef trades recall for speed and should stay above k.
p.set_ef(50)

# Look every vector up again; knn_query returns two numpy arrays.
labels, distances = p.knn_query(data, k=1)
print(labels, distances)
import numpy as np
import hnswlib
from fastapi import APIRouter
from pydantic import BaseModel
from typing import List

router = APIRouter()

# Seed the index with 1000 random 512-dimensional vectors labelled 0..999.
data = np.random.normal(0.0, 1.0, (1000, 512))
item_count = data.shape[0]
data_labels = np.arange(item_count)

# Cosine-space index with room for twice the seed data.
graph = hnswlib.Index(space='cosine', dim=512)
graph.init_index(max_elements=2 * item_count, ef_construction=200, M=16)
graph.add_items(data, data_labels)
graph.set_ef(50)


# Response body for a query: elapsed time plus the returned labels and distances.
class QueryResponse(BaseModel):
    time_passed: float
    labels: List[int]
    distances: List[float]


# Response body for an insertion: elapsed time plus an item count.
class AddResponse(BaseModel):
    time_passed: float
    item_count: int
def testRandomSelf(self):
    """Check save/load round trips and mark_deleted on a random L2 index.

    For 16 seeds: index the first half of the data, verify recall and stored
    vectors, save and reload the index, add the second half, verify again,
    check the complete label set, mark the first half deleted, and verify
    that deleted labels are never returned -- both before and after another
    save/load round trip.
    """
    for idx in range(16):
        print("\n**** Index save-load test ****\n")

        np.random.seed(idx)
        dim = 16
        num_elements = 10000

        # Generating sample data
        data = np.float32(np.random.random((num_elements, dim)))

        # Declaring index
        p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

        # Initing index
        # max_elements - the maximum number of elements, should be known beforehand
        #
        # ef_construction - controls index search speed/build speed tradeoff
        # M - is tightly connected with internal dimensionality of the data
        #     strongly affects the memory consumption
        p.init_index(max_elements=num_elements, ef_construction=100, M=16)

        # Controlling the recall by setting ef:
        # higher ef leads to better accuracy, but slower search
        p.set_ef(100)

        p.set_num_threads(4)  # by default using all available cores

        # We split the data in two batches:
        data1 = data[:num_elements // 2]
        data2 = data[num_elements // 2:]

        print("Adding first batch of %d elements" % (len(data1)))
        p.add_items(data1)

        # Query the elements for themselves and measure recall:
        labels, distances = p.knn_query(data1, k=1)

        items = p.get_items(labels)

        # Check the recall:
        self.assertAlmostEqual(
            np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

        # Check that the returned element data is correct:
        diff_with_gt_labels = np.mean(np.abs(data1 - items))
        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

        # Serializing and deleting the index.
        # We need the part to check that serialization is working properly.
        index_path = 'first_half.bin'
        print("Saving index to '%s'" % index_path)
        p.save_index(index_path)
        print("Saved. Deleting...")
        del p
        print("Deleted")

        print("\n**** Mark delete test ****\n")
        # Reiniting, loading the index
        print("Reiniting")
        p = hnswlib.Index(space='l2', dim=dim)

        print("\nLoading index from '%s'\n" % index_path)
        p.load_index(index_path)
        p.set_ef(100)

        print("Adding the second batch of %d elements" % (len(data2)))
        p.add_items(data2)

        # Query the elements for themselves and measure recall:
        labels, distances = p.knn_query(data, k=1)
        items = p.get_items(labels)

        # Check the recall:
        self.assertAlmostEqual(
            np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

        # Check that the returned element data is correct:
        diff_with_gt_labels = np.mean(np.abs(data - items))
        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

        # Checking that all labels are returned correctly.
        # The former check `~a == b` parsed as `(~a) == b`, which can never be
        # true for non-negative labels, so it passed regardless of content.
        sorted_labels = sorted(p.get_ids_list())
        self.assertEqual(sorted_labels, list(range(num_elements)))

        # Delete data1
        labels1, _ = p.knn_query(data1, k=1)
        deleted_labels = set()
        for l in labels1:
            p.mark_deleted(l[0])
            deleted_labels.add(int(l[0]))

        # The remaining half must still be retrievable.
        labels2, _ = p.knn_query(data2, k=1)
        items = p.get_items(labels2)
        diff_with_gt_labels = np.mean(np.abs(data2 - items))
        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3)

        # No query for a deleted vector may return any deleted label.
        labels1_after, _ = p.knn_query(data1, k=1)
        returned_labels = {int(la[0]) for la in labels1_after}
        self.assertFalse(deleted_labels & returned_labels)
        print("All the data in data1 are removed")

        # checking saving/loading index with elements marked as deleted
        del_index_path = "with_deleted.bin"
        p.save_index(del_index_path)
        p = hnswlib.Index(space='l2', dim=dim)
        p.load_index(del_index_path)
        p.set_ef(100)

        labels1_after, _ = p.knn_query(data1, k=1)
        returned_labels = {int(la[0]) for la in labels1_after}
        self.assertFalse(deleted_labels & returned_labels)

        os.remove(index_path)
        os.remove(del_index_path)
def dedupe(self, args):
    """Group near-duplicate images with the configured neighbour search.

    The backend is NGT unless ``self.hnsw`` or ``self.faiss_flat`` is set.
    Without ``args.query`` every hashed image is compared with its nearest
    neighbours and images within ``self.hamming_distance`` are collected
    into numbered groups in ``self.group``. With ``args.query`` a single
    group is filled with the stored images close to the query image.
    Afterwards the groups are optionally sorted and, when ``args.log`` is
    set and at least one group exists, written to a timestamped log file.

    Args:
        args: Parsed options. This method reads ``num_proc``, ``query``,
            ``log``, ``sameline`` and the backend-specific ``ngt_*``,
            ``hnsw_*`` and ``faiss_flat_k`` settings.

    Exits the process with status 1 when the selected backend's Python
    binding cannot be imported.
    """
    import shutil  # only needed to remove the temporary NGT index

    def absorb_neighbours(i, neighbours, check_list, filenames, group_num):
        """Put image i and its close, still-ungrouped neighbours in one group.

        ``neighbours`` yields ``(index, distance)`` pairs. Returns True when
        ``group_num`` was opened for image ``i``.
        """
        opened = False
        for res in neighbours:
            j, dist = res[0], res[1]
            if j == i:
                continue
            if dist <= self.hamming_distance and check_list[j] == 0:
                if check_list[i] == 0:
                    # new group
                    opened = True
                    check_list[i] = group_num
                    check_list[j] = group_num
                    self.group[group_num] = [filenames[i], filenames[j]]
                else:
                    # exists group
                    exists_group_num = check_list[i]
                    check_list[j] = exists_group_num
                    self.group[exists_group_num].append(filenames[j])
        return opened

    def collect_query_matches(neighbours, filenames, group_num):
        """Append every close neighbour to ``group_num``; True if any matched."""
        matched = False
        for res in neighbours:
            j, dist = res[0], res[1]
            if dist <= self.hamming_distance:
                matched = True
                self.group[group_num].append(filenames[j])
        return matched

    if not self.load_hashcache():
        self.dump_hashcache()

    # check num_proc
    if args.num_proc is None:
        num_proc = max(cpu_count() - 1, 1)
    else:
        num_proc = args.num_proc

    # Use NGT by default
    if (not self.hnsw) and (not self.faiss_flat):
        try:
            import ngtpy
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load NGT. Please install NGT and python binding first.",
                    'red'))
            sys.exit(1)
        index_path = self.get_ngt_index_path()
        logger.warning(
            "Building NGT index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        ngtpy.create(path=index_path.encode(),
                     dimension=self.hash_bits,
                     edge_size_for_creation=args.ngt_edges,
                     edge_size_for_search=args.ngt_edges_for_search,
                     object_type="Byte",
                     distance_type="Hamming")
        ngt_index = ngtpy.Index(index_path.encode())
        ngt_index.batch_insert(self.hashcache.hshs(), num_proc)

        # NGT Approximate neighbor search
        logger.warning("Approximate neighbor searching using NGT")
        hshs = self.hashcache.hshs()
        filenames = self.hashcache.filenames()
        check_list = [0] * len(hshs)
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(len(hshs))):
                if check_list[i] != 0:  # already grouped image
                    continue
                neighbours = ngt_index.search(hshs[i],
                                              size=args.ngt_k,
                                              epsilon=args.ngt_epsilon)
                if absorb_neighbours(i, neighbours, check_list, filenames,
                                     current_group_num):
                    current_group_num += 1
        else:  # query image
            hsh = self.hashcache.gen_hash(args.query)
            self.group[current_group_num] = []
            neighbours = ngt_index.search(hsh,
                                          size=args.ngt_k,
                                          epsilon=args.ngt_epsilon)
            if collect_query_matches(neighbours, filenames,
                                     current_group_num):
                current_group_num += 1

        # remove ngt index (best effort, without going through a shell)
        if index_path:
            shutil.rmtree(index_path, ignore_errors=True)

    elif self.hnsw:
        try:
            import hnswlib
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load hnsw. Please install hnsw python binding first.",
                    'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        filenames = self.hashcache.filenames()
        num_elements = len(hshs)
        hshs_labels = np.arange(num_elements)
        hnsw_index = hnswlib.Index(space='l2',
                                   dim=self.hash_bits)  # Squared L2
        hnsw_index.init_index(max_elements=num_elements,
                              ef_construction=args.hnsw_ef_construction,
                              M=args.hnsw_m)
        # NOTE(review): the original remark says ef should always be > k, yet
        # this lower bound is k - 1 -- confirm which is intended.
        hnsw_index.set_ef(max(args.hnsw_ef, args.hnsw_k - 1))
        hnsw_index.set_num_threads(num_proc)
        logger.warning(
            "Building hnsw index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        hnsw_index.add_items(hshs, hshs_labels, num_proc)

        # hnsw Approximate neighbor search
        logger.warning("Approximate neighbor searching using hnsw")
        check_list = [0] * num_elements
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(num_elements)):
                if check_list[i] != 0:  # already grouped image
                    continue
                labels, distances = hnsw_index.knn_query(
                    hshs[i], k=args.hnsw_k, num_threads=num_proc)
                if absorb_neighbours(i, zip(labels[0], distances[0]),
                                     check_list, filenames,
                                     current_group_num):
                    current_group_num += 1
        else:  # query image
            hsh = self.hashcache.gen_hash(args.query)
            self.group[current_group_num] = []
            labels, distances = hnsw_index.knn_query(hsh,
                                                     k=args.hnsw_k,
                                                     num_threads=num_proc)
            if collect_query_matches(zip(labels[0], distances[0]),
                                     filenames, current_group_num):
                current_group_num += 1

    elif self.faiss_flat:
        try:
            import faiss
        except ImportError:
            logger.error(
                colored(
                    "Error: Unable to load faiss. Please install faiss python binding first.",
                    'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        filenames = self.hashcache.filenames()
        faiss.omp_set_num_threads(num_proc)
        logger.warning(
            "Building faiss index (dimension={}, num_proc={})".format(
                self.hash_bits, num_proc))
        data = np.array(hshs).astype('float32')
        faiss_flat_index = faiss.IndexFlatL2(
            self.hash_bits)  # Exact search
        faiss_flat_index.add(data)

        # faiss Exact neighbor search
        logger.warning("Exact neighbor searching using faiss")
        check_list = [0] * faiss_flat_index.ntotal
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(faiss_flat_index.ntotal)):
                if check_list[i] != 0:  # already grouped image
                    continue
                distances, labels = faiss_flat_index.search(
                    data[[i]], args.faiss_flat_k)
                if absorb_neighbours(i, zip(labels[0], distances[0]),
                                     check_list, filenames,
                                     current_group_num):
                    current_group_num += 1
        else:  # query image
            hsh = np.array([self.hashcache.gen_hash(args.query)
                            ]).astype('float32')
            self.group[current_group_num] = []
            distances, labels = faiss_flat_index.search(
                hsh, args.faiss_flat_k)
            if collect_query_matches(zip(labels[0], distances[0]),
                                     filenames, current_group_num):
                current_group_num += 1

    # sort self.group
    if self.sort != 'none':
        self.sort_group()

    # write duplicate log file
    self.num_duplicate_set = current_group_num - 1
    if self.num_duplicate_set > 0 and args.log:
        now = datetime.now().strftime('%Y%m%d%H%M%S')
        duplicate_log_file = "{}_{}".format(now,
                                            self.get_duplicate_log_name())
        with open(duplicate_log_file, 'w') as f:
            if args.query:
                f.write("Query: {}\n\n".format(args.query))
            for k in range(1, self.num_duplicate_set + 1):
                img_list = self.group[k]
                # In query mode the query image itself counts as a group member.
                pad = 1 if args.query else 0
                if len(img_list) + pad > 1:
                    sorted_img_list, _, _, _ = self.sort_image_list(
                        img_list)
                    if args.sameline:
                        f.write(" ".join(sorted_img_list) + "\n")
                    else:
                        f.write("\n".join(sorted_img_list) + "\n")
                        # Blank line between groups, but not after the last.
                        if k != len(self.group):
                            f.write("\n")
import hnswlib
import numpy as np

dim = 2000
num_elements = 1000
session = 'piano'

# One feature vector per row; labels are the row numbers.
data = np.loadtxt(f"../mfcc_extract/{session}_mfcc.txt", dtype=float)
data_labels = np.arange(len(data))
#data_titles = [ t.strip()[:-8] if t.strip()[-8:] == '_320kbps' else t.strip() for t in open("../mfcc_extract/titles.txt", "r")]

p = hnswlib.Index(space='cosine', dim=dim)
# The capacity must cover everything that is about to be added: keep
# num_elements as the minimum, but grow it when the file has more rows so
# add_items cannot run out of room.
p.init_index(max_elements=max(num_elements, len(data)),
             ef_construction=200,
             M=16)
p.add_items(data, data_labels)
p.set_ef(50)  # ef should always be > k
print(len(data))

# labels, distances = p.knn_query(data[:10], k=5)
# for i, nn in enumerate(labels):
#     print(f"{data_titles[i]} : ")
#     for n in nn: print(data_titles[n])

# Persist the index under a session-specific name, then drop the in-memory copy.
index_path = f'{session}_hnsw.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
del p
def load_index(index_path, dim):
    """Load a saved cosine-space hnswlib index and return it ready to query.

    Args:
        index_path: Location of the saved index file.
        dim: Dimensionality the index was built with.

    Returns:
        The loaded ``hnswlib.Index`` with ``ef`` set to 50.
    """
    index = hnswlib.Index(space='cosine', dim=dim)
    print(f"\nLoading index from '{index_path}'")
    index.load_index(index_path)
    index.set_ef(50)
    return index
# Sentence-embedding model used to encode the corpus.
model_name = 'distilbert-multilingual-nli-stsb-quora-ranking'
model = LanguageTransformer(model_name)

# Source and local copy of the Quora duplicate-questions dataset.
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000

# The cache file name encodes the model and the corpus size; '/' in the model
# name is replaced so it cannot introduce a directory component.
embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
    model_name.replace('/', '_'), max_corpus_size)

embedding_size = 768  # Size of embeddings
top_k_hits = 10  # Output k hits

# Defining our hnswlib index.
# The index is created with space='cosine'.
# NOTE(review): an earlier remark here described an inner-product index over
# unit-length vectors, which does not match this call -- confirm which metric
# is intended.
index = hnswlib.Index(space='cosine', dim=embedding_size)

# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it.
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding='utf8') as fIn:
        # Tab-separated file with a header row.
        reader = csv.DictReader(fIn,
                                delimiter='\t',
                                quoting=csv.QUOTE_MINIMAL)
def testRandomSelf(self):
    """Check that an index can be resized and keeps working afterwards.

    For 16 seeds: create an index sized for half of the data, add and verify
    the first half, resize the index to the full size, add and verify the
    second half, and finally check that exactly the labels
    ``0 .. num_elements - 1`` are stored.
    """
    for idx in range(16):
        print("\n**** Index resize test ****\n")

        np.random.seed(idx)
        dim = 16
        num_elements = 10000

        # Generating sample data
        data = np.float32(np.random.random((num_elements, dim)))

        # Declaring index
        p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

        # Initiating index
        # max_elements - the maximum number of elements, should be known beforehand
        #
        # ef_construction - controls index search speed/build speed tradeoff
        # M - is tightly connected with internal dimensionality of the data
        #     strongly affects the memory consumption
        #
        # Only half of the final capacity is reserved here, so the second
        # batch cannot be added without resizing.
        p.init_index(max_elements=num_elements // 2,
                     ef_construction=100,
                     M=16)

        # Controlling the recall by setting ef:
        # higher ef leads to better accuracy, but slower search
        p.set_ef(20)

        p.set_num_threads(idx % 8)  # by default using all available cores

        # We split the data in two batches:
        data1 = data[:num_elements // 2]
        data2 = data[num_elements // 2:]

        print("Adding first batch of %d elements" % (len(data1)))
        p.add_items(data1)

        # Query the elements for themselves and measure recall:
        labels, distances = p.knn_query(data1, k=1)

        items = p.get_items(list(range(len(data1))))

        # Check the recall:
        self.assertAlmostEqual(
            np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

        # Check that the returned element data is correct:
        diff_with_gt_labels = np.max(np.abs(data1 - items))
        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

        print("Resizing the index")
        p.resize_index(num_elements)

        print("Adding the second batch of %d elements" % (len(data2)))
        p.add_items(data2)

        # Query the elements for themselves and measure recall:
        labels, distances = p.knn_query(data, k=1)
        items = p.get_items(list(range(num_elements)))

        # Check the recall:
        self.assertAlmostEqual(
            np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

        # Check that the returned element data is correct:
        diff_with_gt_labels = np.max(np.abs(data - items))
        self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

        # Checking that all labels are returned correctly.
        # The former check `~a == b` parsed as `(~a) == b`, which can never be
        # true for non-negative labels, so it passed regardless of content.
        sorted_labels = sorted(p.get_ids_list())
        self.assertEqual(sorted_labels, list(range(num_elements)))
# NOTE(review): the next three statements are the tail of a function whose
# `def` line lies above this view -- presumably list_image_paths(collection_dir),
# judging by the calls further down; confirm against the full file.
    # Every sub-directory of collection_dir contributes its "image" folder.
    scene_dirs = [
        path.join(collection_dir, d, "image") for d in listdir(collection_dir)
        if path.isdir(path.join(collection_dir, d))
    ]
    # Only the first entry listdir returns for each "image" folder is used.
    image_paths = [[path.join(d, e) for e in listdir(d)][0] for d in scene_dirs]
    # Pair each full path with the same path minus the sun_rgbd_directory
    # prefix and the following separator.
    return [(p[len(sun_rgbd_directory) + 1:], p) for p in image_paths]


# Image paths from the two kv2 collections.
images = list_image_paths(path.join(
    sun_rgbd_directory, "kv2", "kinect2data")) + list_image_paths(
        path.join(sun_rgbd_directory, "kv2", "align_kv2"))

database = PatchGraphDatabase()

# L2 descriptor index with a fixed capacity of 7,000,000 items.
desc_index = hnswlib.Index(space='l2', dim=descriptor_size)
desc_index.init_index(max_elements=7000000, ef_construction=200, M=16)
desc_index.set_ef(50)


def extract_features_to_db(image_path, image_name, scene_node, size):
    # Extract patches from one image, store them in the graph database and
    # add their descriptors to the module-level descriptor index.
    # scene_node is not used in the part of the body visible here.
    patches = extract_image_features(image_path, image_name, size)
    insert_patches_result = database.insert_patches(patches)
    print("inserted patch nodes", len(insert_patches_result))
    # Each inserted patch carries its descriptor under 'des' and its id under
    # 'id'; the id becomes the label in the index.
    data = np.array([p['des'] for p in insert_patches_result],
                    dtype=np.float32)
    data_labels = np.array([p['id'] for p in insert_patches_result])
    desc_index.add_items(data, data_labels)