import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


def query(fname, key='key', topk=10, truncate=80):
    """Return the top-k nearest documents to `fname`, joined with metadata."""
    model = pickle.load(open(CONST.MODEL, 'rb'))
    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])
    lsh = LSH(corpus, model)
    # Locate the row whose key column matches the requested document.
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]
    # Truncate content for display before joining the results.
    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    return (lsh.query(corpus[index, :], int(topk), 10)[0]
            .join(dataframe, on='id')
            .sort_values('distance')
            .iloc[:, 1:])
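# Hypothetical usage of query(), assuming CONST.MODEL and CONST.DATASET
# point at a pickled LSH model and a CSV with a 'key' column; the document
# id '42' and the parameter values below are illustrative only.
results = query('42', key='key', topk=5, truncate=60)
print(results.head())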
import json

import mysql.connector
import numpy as np


def cloud_main(self, file_count):
    # Establish the database connection.
    conn = mysql.connector.connect(user='******', password='******',
                                   host='127.0.0.1', database='ImageRetrieval')
    # Create a cursor object using the cursor() method.
    cursor = conn.cursor()
    # Prepare the SQL query that selects all image records.
    sql_select_Query = "select * from images"

    # Load the encrypted payloads produced by the client.
    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)

    # Decrypt the feature vectors, LSH indices, and query vectors.
    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))

    # Restore the original array shapes from the raw byte buffers.
    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)

    # Query the LSH index for the nearest neighbours of the query vectors.
    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)

    # Fetch all image records and send back the ones that matched.
    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)

    # Closing the connection
    conn.close()
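# The Decrypt helper is not shown above. This is a minimal hypothetical
# sketch, assuming the client and cloud share a symmetric Fernet key; the
# key file path and the choice of cryptography.fernet are assumptions,
# not confirmed by the original code.
from cryptography.fernet import Fernet


class Decrypt:
    def __init__(self, key_path='secret.key'):
        with open(key_path, 'rb') as f:
            self._fernet = Fernet(f.read())

    def decrypt_indices_vector(self, token: bytes) -> bytes:
        # Return the decrypted plaintext bytes of one encrypted payload.
        return self._fernet.decrypt(token)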
import sys

vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of hash tables
digest_length = 0
print('perform lsh')
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

''' Query documents '''
dedup = set()
keys = lsh.hash_tables[0].keys()
for i, key in enumerate(keys):
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            # The warc id is appended as extra data in lsh.index().
            candidate_key = c[0][1]
            if candidate_key == query_object[1]:
                continue
            # Emit each pair once by ordering the two ids.
            if str(query_object[1]) <= str(candidate_key):
                candidate_distance = c[1]
                if float(candidate_distance) >= threshold:
                    dedup.add((query_object[1], candidate_key,
                               candidate_distance, 'false'))
                else:
                    dedup.add((query_object[1], candidate_key,
                               candidate_distance, 'true'))
    sys.stdout.write("\rDoing thing %i" % i)
    sys.stdout.flush()
import numpy as np


class DND:
    """Differentiable neural dictionary backed by an LSH index."""

    MAX_SIZE = 25   # maximum number of neighbours returned per query
    TM = 0.1        # LRU time increment per access

    def __init__(self, N, D, K, L):
        self.lsh = LSH(SimHash(D, K, L), K, L)
        self.keys = np.zeros((N, D), dtype=np.float32)
        self.values = np.zeros((N, 1), dtype=np.float32)
        self.lru = np.zeros(N, dtype=np.float32)
        self.key2idx = dict()
        self.size = 0
        self.max_memory = N
        self.K = K
        self.L = L

    def __contains__(self, key):
        return tuple(key) in self.key2idx

    def __getitem__(self, key):
        try:
            index = self.key2idx[tuple(key)]
            self.lru[index] += DND.TM
            return self.values[index]
        except KeyError:
            return None

    def __setitem__(self, key, value):
        item = tuple(key)
        try:
            # 1) Find the memory index for the key vector.
            index = self.key2idx[item]
        except KeyError:
            # 2) Add the key vector if it is not present.
            if self.size >= self.max_memory:
                # 3) Memory is full: select the LRU memory index and
                #    remove it from the LSH hash tables.
                index = np.argmin(self.lru)
                self.lsh.erase(self.keys[index], index)
                # Forget the evicted key's mapping so stale lookups fail.
                self.key2idx.pop(tuple(self.keys[index]), None)
            else:
                index = self.size
                self.size += 1
            # Rehash the key into the LSH hash tables.
            self.lsh.insert(key, index)
            self.key2idx[item] = index
            # Add the new key to memory.
            self.keys[index] = key
        finally:
            # Update the stored value and touch the LRU counter.
            self.values[index] = value
            self.lru[index] += DND.TM

    def retrieve(self, query):
        # Collect memory indices from the LSH hash tables.
        indices, cL = self.lsh.query(query.data, DND.MAX_SIZE)
        # Gather keys and values from memory.
        keys = self.keys[indices]
        values = self.values[indices]
        self.lru[indices] += DND.TM
        assert keys.shape[0] == values.shape[0]
        return keys, values, indices, cL
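# Illustrative usage of DND, assuming the SimHash and LSH classes from the
# snippet's context are importable; the sizes below are arbitrary examples.
import numpy as np

dnd = DND(N=1000, D=64, K=8, L=4)

key = np.random.randn(64).astype(np.float32)
dnd[key] = 1.5          # insert (or update) a key/value pair
assert key in dnd       # membership test via __contains__
print(dnd[key])         # -> array([1.5], dtype=float32)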
import time

lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

''' Query documents '''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print(time.asctime(time.localtime(time.time())))
print('Query documents')

distance_func = "cosine"
corr = set()
for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)
    candidates = lsh.query(query_object, distance_func=distance_func)
    for c in candidates:
        # The warc id is appended as extra data in lsh.index().
        candidate_key = c[0][1]
        candidate_distance = c[1]
        # Keep each pair once by ordering the two ids.
        if str(key) < str(candidate_key):
            candidate = (key, candidate_key, candidate_distance)
            corr.add(candidate)

# Write the correspondences sorted by distance.
corr_list = sorted(corr, key=lambda x: x[2])
f = open('correspondences/' + domain + '_sim.txt', 'w')
for t in corr_list:
    f.write(t[0] + ' ' + t[1] + ' ' + str(t[2]) + '\n')
f.close()
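# Both deduplication loops above assume lsh.query returns candidates shaped
# as ((vector, extra_data), distance) tuples; the values below are purely
# illustrative of that layout, not real output.
candidate = (([0.1, 0.9, 0.0], 'warc-id-42'), 0.03)
(vector, extra_data), distance = candidate
assert extra_data == candidate[0][1]   # the warc id stored via extra_data
assert distance == candidate[1]        # cosine distance to the query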
from unittest import TestCase

import numpy as np


class TestLsh(TestCase):
    """Unit tests for LSH hashing, indexing, and querying."""

    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)
        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]

    def test_hashing(self):
        vector_ones = [1, 1]
        # Each plane is dotted with the vector without a scalar offset:
        # a value greater than zero appends "1" to the hash, "0" otherwise.
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_ones), "100")
        vector_twos = [-2, 2]
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_twos), "101")

    def test_table_indexing(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        self.assertDictEqual(self.lsh.hash_tables[0], {
            "100": [([1, 1], "data1")],
            "101": [([-2, 2], "data2")],
        })
        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[0],
            {"100": [([1, 1], "data1")], "101": [([-2, 2], "data2")]},
        )
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[1],
            {"010": [([1, 1], "data1")], "011": [([-2, 2], "data2")]},
        )

    def test_query(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        output = self.lsh.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        output = self.lsh_two_tables.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([-1, -1], "data3")
        self.lsh_two_tables.index([6, 6], "data4")
        self.lsh_two_tables.index([-10, -10], "data5")
        output = self.lsh_two_tables.query([6, 6], 2)
        self.assertEqual(output, ["data4", "data1"])
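# A minimal sketch of a hash function consistent with what these tests
# expect: one bit per hyperplane, "1" where the dot product is strictly
# positive, "0" otherwise. This is an assumption about the tested class,
# not its actual implementation.
import numpy as np


def hash_vector(planes: np.ndarray, vector) -> str:
    dots = planes @ np.asarray(vector)
    return "".join("1" if d > 0 else "0" for d in dots)


assert hash_vector(np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]), [1, 1]) == "100"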
if __name__ == "__main__": data, q = load_data() # Brute Force nn_brute, nn_brute_dist = brute_force_nn(q, data) # Ball Tree bt = BallTree(data, 10) nn_bt = bt.query_top_down(q) # LSH hash_fn_gen = lambda: guassian_hash_generator(150, data.shape[1]) lsh = LSH(data, hash_fn_gen, 1, 10) nn_lsh, performace_limit = lsh.query(q) # Ball tree LSH print('balltree lsh performance limit', performace_limit) lsh = LSH(data, hash_fn_gen, 1, 3) bt_lsh = BallTreeLSH(bt, lsh) nn_bt_lsh = bt_lsh.query(q, performance_limit=performace_limit) # compare_results _, nn_lsh_to_q = brute_force_nn(nn_lsh, q) _, nn_bt_to_q = brute_force_nn(nn_bt, q) _, nn_bt_lsh_to_q = brute_force_nn(nn_bt_lsh, q) ######### # Stats # #########
import sys

import torch
import torch.nn.functional as F
import torch.optim as optim


def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)

    lsh = LSH(SimHash(ed, K, L), K, L)

    estimates = []
    avg_estimate = []
    id_set = set()
    n_iters = num_iterations * batch_size
    for batch_idx in range(n_iters):
        iteration = batch_idx // batch_size
        MI = schedule[iteration]

        # Rebuild the LSH tables frequently early on, then less often.
        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            build(lsh, model, xs)

        optimizer.zero_grad()
        y = ys[batch_idx:batch_idx + 1]
        ey = model.embed_y(y)

        # Retrieve candidate negatives for y from the LSH index.
        id_list = lsh.query(ey)
        id_set = id_set.union(set(id_list))
        indices = torch.LongTensor(id_list).to(device)
        nx = F.embedding(indices, xs)
        px = xs[batch_idx:batch_idx + 1]
        x = torch.cat([px, nx], dim=0)
        x = torch.unsqueeze(x, dim=0)

        # Maximize the MI estimate by minimizing its negation.
        mi = model(x, y, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        avg_estimate.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            '''
            asim = model.cosine_similarity(x, y)
            true = torch.mean(torch.diag(asim))
            neye = 1. - torch.eye(batch_size).to(device)
            noise = torch.sum(torch.mul(asim, neye)).item() / (batch_size * (batch_size - 1))
            print("MI:{} true: {:.4f}, noise: {:.4f}".format(MI, true, noise))
            '''
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name, batch_idx + 1, MI, avg_mi))
            sys.stdout.flush()

        if (batch_idx + 1) % wsize == 0:
            print(len(id_set), len(id_set) // wsize)
            id_set.clear()
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            estimates.append(avg_mi)
            avg_estimate.clear()

    lsh.stats()
    return estimates
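# Hypothetical sketch of the build(lsh, model, xs) helper called above:
# re-embed every x and re-insert it into the LSH tables. model.embed_x and
# lsh.clear are assumptions about the original code; lsh.insert(vector,
# index) mirrors the DND usage earlier in this section.
import torch


def build(lsh, model, xs):
    with torch.no_grad():
        ex = model.embed_x(xs)    # embed all candidates at once
    lsh.clear()                   # drop the stale hash tables
    for idx in range(ex.shape[0]):
        lsh.insert(ex[idx], idx)  # rehash each embedding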