def lsh(p_hash_size, distance_func):
    """
    Simulate KNN with locality-sensitive hashing.
    :param p_hash_size: multiplied by the total number of (deduplicated) vipnos to form the final hash_size
    :param distance_func: the distance function to use
    :return: the KNN output vipnos for the chosen vipno, with the vipno itself removed
    """
    datas_set, datas_matrix = get_data()
    # vipno_nums is the total number of vipnos after deduplication
    vipno_nums = len(datas_matrix[0])
    # Pick a random vipno (here, the column index of that vipno)
    random_vipno = random.randint(0, vipno_nums - 1)
    # Initialize LSHash
    lsh = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for i in range(vipno_nums):
        # extra_data is the vipno of the current column; it is what we want the KNN query to output later
        lsh.index(datas_matrix[:, i], extra_data=datas_set.columns[i])
    vipno_res = []
    # num_results caps the number of results; take the top 6 because the first is the input column itself
    for res in lsh.query(datas_matrix[:, random_vipno], num_results=6, distance_func=distance_func):
        vipno_res.append(res[0][1])
    print("distance func:", distance_func)
    print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
    return vipno_res[1:], datas_set.columns[random_vipno]
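# A minimal usage sketch for the function above (the 0.01 ratio and the
# distance function name are illustrative; assumes get_data() and the other
# globals used by lsh() are defined elsewhere):
knn_vipnos, query_vipno = lsh(0.01, 'euclidean')
print("query vipno:", query_vipno)
print("nearest neighbours:", knn_vipnos)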
def getLSHashOutput(filename, hash_size, k):
    matrix = getMatrix(filename)
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(matrix.iloc[:, 0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # Several distance functions are available; the default is euclidean
    print(lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean'))
def sphere():
    # Index 1000 random points projected onto the unit sphere,
    # then query with a slightly perturbed copy of the first point.
    X = np.random.normal(size=(1000, 3))
    lsh = LSHash(10, 3, num_hashtables=5)
    for x in X:
        x /= np.linalg.norm(x)
        lsh.index(x)
    closest = lsh.query(X[0] + np.array([-0.001, 0.001, -0.001]), distance_func="cosine")
    assert len(closest) >= 10
    assert 0.05 >= closest[9][-1] > 0.0003
def hyperspheres(X=16, num_samples=200000):
    """ Demonstrate the curse of dimensionality and where LSH starts to fail

    Returns:
        lsh, X, dfs

    >>> import pandas as pd
    >>> lsh, vectors, dfs = hyperspheres(16)
    >>> for df in dfs:
    ...     print(df)
    """
    X = np.random.uniform(size=(num_samples, X)) if isinstance(X, int) else X
    closest = []
    secondclosest = []
    tenthclosest = []
    hundredthclosest = []
    for D in range(2, X.shape[1] + 1):
        lsh = LSHash(int(64 / D) + D, D, num_hashtables=D)
        # query vector
        q = np.random.uniform(size=(D,))
        q /= np.linalg.norm(q)
        distances = []
        for x in X[:, :D]:
            x /= np.linalg.norm(x)
            distances += [1. - np.sum(x * q)]  # cosine distance
            lsh.index(x)
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')
        N = len(closest10)
        hundredthclosest += [[D, N, closest10[min(99, N - 1)][-1] if N else 2., distances[min(99, N - 1)]]]
        tenthclosest += [[D, N, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]]
        secondclosest += [[D, N, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]]
        closest += [[D, N, closest10[0][-1] if N else 2., distances[0]]]
        print("is correct: 100th 10th 2nd 1st")
        print(round(hundredthclosest[-1][-1], 14) == round(hundredthclosest[-1][-2], 14))
        print(round(tenthclosest[-1][-1], 14) == round(tenthclosest[-1][-2], 14))
        print(round(secondclosest[-1][-1], 14) == round(secondclosest[-1][-2], 14))
        print(round(closest[-1][-1], 14) == round(closest[-1][-2], 14))
        print("distances: 100th 10th 2nd 1st")
        print(hundredthclosest[-1])
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    dfs = []
    for i, rows in zip([100, 10, 2, 1],
                       [hundredthclosest, tenthclosest, secondclosest, closest]):
        df = pd.DataFrame(rows, columns='D N dist{} true_dist{}'.format(i, i).split()).round(14)
        df['correct{}'.format(i)] = df['dist{}'.format(i)] == df['true_dist{}'.format(i)]
        dfs += [df]
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, dfs
def b(r, dim, vector):
    # NOTE: `xxx` is an iterable of (name, vector) pairs that the original
    # source defines elsewhere; it is left as-is here.
    lsh = LSHash(r, dim)
    for n, v in xxx:
        lsh.index(v.tolist())
    # Time the query for the 10 nearest neighbours under cosine distance
    start = time.perf_counter()
    q = lsh.query(vector.tolist(), 10, 'cosine')
    end = time.perf_counter()
    # Convert cosine distances back to similarities
    qq = [(x, 1 - y) for x, y in q]
    if len(qq) > 0:
        return qq[0][1], end - start
    else:
        return -2, end - start
def lsh(p_hash_size, distance_funcs):
    """
    Simulate KNN with locality-sensitive hashing.
    :param p_hash_size: multiplied by the total number of (deduplicated) vipnos to form the final hash_size
    :param distance_funcs: the distance functions to compare
    :return: the KNN output vipnos for the chosen vipno, with the vipno itself removed
    """
    datas_set, datas_matrix = get_data()
    # vipno_nums is the total number of vipnos after deduplication
    vipno_nums = len(datas_matrix[0])
    # Pick a random vipno (here, the column index of that vipno)
    random_vipno = random.randint(0, vipno_nums - 1)
    # Initialize LSHash
    lsh = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for i in range(vipno_nums):
        # extra_data is the vipno of the current column; it is what we want the KNN query to output later
        lsh.index(datas_matrix[:, i], extra_data=datas_set.columns[i])
    print("hash size: {}".format(vipno_nums * p_hash_size))
    print("input vipno: {}".format(datas_set.columns[random_vipno]))
    ends = []
    for distance_func in distance_funcs:
        start = datetime.datetime.now()
        vipno_res = []
        # num_results caps the number of results; take the top 6 because the first is the input column itself
        for res in lsh.query(datas_matrix[:, random_vipno], num_results=6, distance_func=distance_func):
            vipno_res.append(res[0][1])
        end = (datetime.datetime.now() - start).total_seconds()
        ends.append(end)
        print("distance func:", distance_func)
        print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
        print("time:", end)
    # Plot a bar chart comparing the query time of each distance function
    plt.bar(distance_funcs, ends, alpha=0.9, width=0.35, facecolor='lightskyblue',
            edgecolor='white', label='time', lw=1)
    plt.legend(loc="upper left")
    plt.show()
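# A minimal usage sketch for the timing comparison above. The distance
# function names are the six that lshash accepts; the 0.01 ratio is
# illustrative, and get_data() is assumed to exist as in the function:
lsh(0.01, ['hamming', 'euclidean', 'true_euclidean', 'centred_euclidean', 'cosine', 'l1norm'])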
def hyperspheres_10D(X=np.random.uniform(size=(200000, 10))):
    """ Demonstrate the curse of dimensionality and where LSH starts to fail

    Returns:
        lsh, X, closest, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, rank1, rank2, rank10 = hyperspheres_10D()
    >>> pd.DataFrame(rank2)
    >>> pd.DataFrame(rank10)
    """
    tenthclosest = []
    secondclosest = []
    closest = []
    for D in range(2, X.shape[1]):
        lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D)
        # query vector
        q = np.random.uniform(size=(D,))
        q /= np.linalg.norm(q)
        distances = []
        for x in X[:, :D]:
            lsh.index(x)
            x /= np.linalg.norm(x)
            distances += [1. - np.sum(x * q)]  # cosine distance
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')
        N = len(closest10)
        tenthclosest += [[D,
                          min(9, N - 1) if N else -1,
                          closest10[min(9, N - 1)][-1] if N else 2.,
                          distances[min(9, N - 1)]]]
        secondclosest += [[D,
                           min(1, N - 1) if N else -1,
                           closest10[min(1, N - 1)][-1] if N else 2.,
                           distances[min(1, N - 1)]]]
        closest += [[D,
                     0 if N else -1,
                     closest10[0][-1] if N else 2.,
                     distances[0]]]
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, closest, secondclosest, tenthclosest
def knn(df, k, coefficient):
    hash_size = int(coefficient * df.shape[1])
    lsh = LSHash(hash_size, input_dim=df.shape[0])
    for vipno in df:
        lsh.index(df[vipno], extra_data=vipno)
    random_column = df[df.columns.to_series().sample(1)]
    random_vip = random_column.columns.values[0]
    logging.info('random vipno: {}'.format(random_vip))
    res = lsh.query(random_column.values.flatten())[0:k + 1]
    logging.info('vipno in ranked order using kNN(k = {}):'.format(k))
    knns = []
    for item in res:
        if item[0][1] != random_vip:
            logging.info(item[0][1])
            knns.append(item[0][1])
    return random_vip, knns[:5]
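# A minimal usage sketch for knn() above, on synthetic data. Note the
# orientation the function expects: one column per vipno, one row per
# feature dimension. The demo labels and sizes are illustrative:
import numpy as np
import pandas as pd

demo_df = pd.DataFrame(np.random.randint(0, 10, size=(50, 8)),
                       columns=['vip{}'.format(i) for i in range(8)])
query_vip, neighbours = knn(demo_df, k=5, coefficient=0.5)
print(query_vip, neighbours)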
def getLSHashOutput(filename, hash_size, k):
    matrix = getMatrix(filename)
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(matrix.iloc[:, 0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # Several distance functions are available; the default is euclidean
    m = lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean')
    print("Input vipno: " + str(matrix.columns[out_num]) + "\nvipnos in its bucket:")
    bucket = []
    for i in range(len(m)):
        print(m[i][0][1])
        tag = np.argwhere(matrix.columns == m[i][0][1])
        bucket.append(int(tag))
    return bucket
            returnvec[vocablist.index(word)] += 1
        else:
            print('word:', word, 'is not in the list_vec')
    return returnvec


if __name__ == '__main__':
    datalist, classlist, vocabset = textprocess('./paper')  # get the word set of each paper
    stop_word_file = './stopwords_cn.txt'
    stop_word_set = make_word_set(stop_word_file)
    feature_words = word_dict(vocabset, 0, stop_word_set)
    trainMat = []
    lsh = LSHash(hash_size=10, input_dim=len(feature_words))
    for postinDoc in datalist:
        trainMat_vec = bagof_word2vec(feature_words, postinDoc)  # vectorize the training set
        trainMat.append(trainMat_vec)
        lsh.index(trainMat_vec)
    testfile = './test.txt'
    testlist = []
    with open(testfile, 'r', encoding='utf-8') as f:
        sequence = f.read()
        testlist.append(jieba.lcut(sequence, cut_all=False))
    testvect = bagof_word2vec(feature_words, testlist[0])
    res = lsh.query(testvect, num_results=1)
    print(list(res[0][0]))
    print(trainMat.index(list(res[0][0])))
    print('The most similar paper is:', classlist[trainMat.index(list(res[0][0]))])
count = np.zeros(DBconnection.DBconnection.count(a.dbconnect_to_collection()))
index2 = tfidf.indptr[DBconnection.DBconnection.count(a.dbconnect_to_collection())]
for i in range(DBconnection.DBconnection.count(a.dbconnect_to_collection()),
               DBconnection.DBconnection.count(b.dbconnect_to_collection()) - 1):
    # Build an 8-dimensional tf-idf vector for document i, padding with 1s if shorter
    vec = []
    j = 0
    while j < (tfidf.indptr[i + 1] - tfidf.indptr[i]):
        if j > 7:
            break
        vec.append(round(tfidf.data[index2 + j], 2))
        j += 1
    if len(vec) < 8:
        for index in range(8 - len(vec)):
            vec.append(1)
    final_result = lsh.query(vec)
    # Retry the query up to three times if the result comes back empty
    whileflag = 0
    while not final_result and whileflag < 3:
        final_result = lsh.query(vec)
        whileflag += 1
    if not final_result:
        count[0] += 1
        print(assinglist[i])
    else:
        checklist = []
        for elem in final_result[0][0]:
            checklist.append(elem)
        count[centriodSet.index(checklist)] += 1
class ImageSearchEngine(object):
    """A simple image search engine based on ORB, K-means and LSHash."""

    def __init__(self):
        self._all_feats = []
        self._img_dict = {}
        self._kmeans = None
        self._lsh = None

    def load_images(self, img_list: list) -> int:
        """Load images, extract features using ORB for indexing.

        Args:
            img_list: list of image files' names.

        Returns:
            count of image files successfully loaded.
        """
        count = 0
        progress_bar = tqdm(total=len(img_list))
        for img_name in img_list:
            try:
                img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
                _, features = orb.detectAndCompute(img, None)
                # Record index of features for this image
                start_index, num_feats = len(self._all_feats), len(features)
                self._img_dict[img_name] = {'start_index': start_index,
                                            'num_feats': num_feats}
                # Append new features
                self._all_feats.extend([feat for feat in features])
                count += 1
            except Exception as e:
                logger.warning(e)
                logger.warning('Error processing {}'.format(img_name))
            progress_bar.update(1)
        progress_bar.close()
        logger.info('Successfully loaded {} images, extracted {} features.'
                    .format(count, len(self._all_feats)))
        return count

    def build_index(self, k: int, hash_size: int = 10, num_hashtables: int = 1,
                    store_file: str = None, overwrite: bool = False):
        """Build index for each picture.

        First use K-means to find k key features from previously extracted
        features and the assignment of each feature; then apply a histogram
        to each image to get the distribution of its features, which serves
        as a unique fingerprint for that image. Finally use LSHash (locality
        sensitive hashing) to index each image by its histogram array.

        Args:
            k: parameter used in K-means, number of centroids (key features).
            hash_size: length of resulting binary hash array.
            num_hashtables: number of hashtables for multiple lookups.
            store_file: path to the .npz file where random matrices are
                stored, or are to be stored if the file does not exist yet.
            overwrite: whether to overwrite the matrices file if it already
                exists.
        """
        assert 0 < k < len(self._all_feats)
        assert hash_size > 0 and num_hashtables > 0
        # Use K-means to calculate k key features and the assignment of each feature.
        logger.info('Calculating {} key features...'.format(k))
        # Mini-batch K-means deals with large amounts of data better.
        self._kmeans = MiniBatchKMeans(n_clusters=k)
        self._kmeans.fit(np.array(self._all_feats))
        idx = self._kmeans.labels_

        logger.info('Start indexing each image.')
        # Calculate the histogram of each image
        self._lsh = LSHash(hash_size=hash_size, input_dim=k,
                           num_hashtables=num_hashtables,
                           matrices_filename=store_file, overwrite=overwrite)
        success = 0
        progress_bar = tqdm(total=len(self._img_dict))
        bins = np.arange(-0.5, k + 0.5, 1)
        for img_name, img_meta in self._img_dict.items():
            try:
                start = img_meta['start_index']
                end = start + img_meta['num_feats']
                # Perform histogram
                hist, _ = np.histogram(idx[start:end], bins=bins)
                img_meta['histogram'] = hist
                # Store each picture in hash tables
                self._lsh.index(input_point=hist, extra_data=img_name)
                success += 1
            except Exception as e:
                logger.warning(e)
                logger.warning('Error when indexing image: {}'.format(img_name))
            progress_bar.update(1)
        progress_bar.close()
        logger.info('Successfully indexed {} images.'.format(success))

    def search(self, img_name: str, num_results: int = None,
               distance_func: str = None) -> list:
        """Search image.

        Args:
            img_name: name of the image file to be searched.
            num_results: the number of query results to return in ranked
                order. By default all results will be returned.
            distance_func: the distance function to be used, one of
                ("hamming", "euclidean", "true_euclidean",
                "centred_euclidean", "cosine", "l1norm"). By default
                "euclidean" will be used.

        Returns:
            list of names of matching images.
        """
        assert self._lsh is not None and self._kmeans is not None
        res = []
        try:
            img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
            _, features = orb.detectAndCompute(img, None)
            idx = self._kmeans.predict(features)
            bins = np.arange(-0.5, len(self._kmeans.cluster_centers_) + 0.5, 1)
            hist, _ = np.histogram(idx, bins=bins)
            res = self._lsh.query(hist, num_results=num_results,
                                  distance_func=distance_func)
        except Exception as e:
            logger.warning(e)
        return res

    def dump(self, pkl_file: str = 'model.pkl'):
        with open(pkl_file, 'wb') as f:
            pickle.dump(self, f)

    @property
    def num_images(self) -> int:
        return len(self._img_dict)
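# A minimal usage sketch for ImageSearchEngine (hypothetical file names;
# assumes the module-level `orb` detector and `logger` used by the class
# are already set up, e.g. orb = cv2.ORB_create()):
engine = ImageSearchEngine()
engine.load_images(['img/cat_001.jpg', 'img/cat_002.jpg', 'img/dog_001.jpg'])
engine.build_index(k=64)  # k must stay below the number of extracted features
matches = engine.search('img/cat_001.jpg', num_results=3)
for (hist, name), dist in matches:
    print(name, dist)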
def test_lshash():
    lsh = LSHash(6, 8)  # create a 6-bit hash for 8-dimensional input data
    lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
    lsh.index([2, 3, 4, 5, 6, 7, 8, 9])
    lsh.index([10, 12, 99, 1, 5, 31, 2, 3])
    print(lsh.query([1, 2, 3, 4, 5, 6, 7, 7]))
# In[10]:

o = open('lsh_output.txt', 'w')  # create a file to write the results

# loop over different hash sizes
for e in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
    lsh = LSHash(round(n_vip * e), n_plu)
    for v in vipno:
        feature = list(trade_mat[v])
        lsh.index(feature, extra_data=v)

    # pick a random vipno
    pick_vip = random.randint(0, n_vip - 1)
    pick_vip = vipno[pick_vip]
    o.write("Hash_size = {} * n_vip \n".format(e))
    o.write("Pick up a vip: {}\n".format(pick_vip))

    # run the lsh query and write the results (the first hit is the query itself)
    candi = lsh.query(list(trade_mat[pick_vip]), 6, distance_func='hamming')
    for i, item in enumerate(candi[1:]):
        dist = item[1]
        feature = list(item[0][0])
        v = item[0][1]
        o.write("Top {0} : vipno = {1}, distance = {2}\n".format(i + 1, v, dist))
    o.write("\n")

o.close()
print("The lshash results have been saved in file 'lsh_output.txt'.")
class LocalSensitiveHash(object):
    def __init__(self, hash_size, input_dim, num_of_hashtables=1, storage=None,
                 matrices_filename=None, overwrite=False):
        """
        Attributes:
        :param hash_size: The length of the resulting binary hash, in bits.
            E.g., 32 means the resulting binary hash will be 32 bits long.
        :param input_dim: The dimension of the input vector. E.g., a
            grey-scale picture of 30x30 pixels will have an input dimension
            of 900.
        :param num_of_hashtables: (optional) The number of hash tables used
            for multiple lookups.
        :param storage: (optional) A dictionary of the form
            `{backend_name: config}` where `backend_name` is either `dict`
            or `redis`, and `config` is the configuration used by the
            backend. For `redis` it should be in the format of
            `{"redis": {"host": hostname, "port": port_num}}`, where
            `hostname` is normally `localhost` and `port` is normally 6379.
        :param matrices_filename: (optional) Specify the path to the
            compressed numpy file, ending with extension `.npz`, where the
            uniform random planes are stored, or are to be stored if the
            file does not exist yet.
        :param overwrite: (optional) Whether to overwrite the matrices file
            if it already exists.
        """
        self.hash_object = LSHash(
            hash_size=hash_size,               # length of the binary hash
            input_dim=input_dim,               # dimension of the input vector
            num_hashtables=num_of_hashtables,  # (optional) number of hash tables for multiple lookups
            storage_config=storage,            # (optional) storage backend config, e.g. "redis"
            matrices_filename=matrices_filename,  # (optional) .npz path where the random matrices are stored
            overwrite=overwrite)               # (optional) whether to overwrite an existing matrices file

    def lsh_index(self, input_point, extra_data=None):
        """Index a data point in the underlying LSHash instance.

        :param input_point: an array or tuple of size input_dim.
        :param extra_data: (optional) extra data to be stored together with
            the input_point.
        """
        self.hash_object.index(input_point=input_point, extra_data=extra_data)

    def lsh_query(self, query_point, num_results=None, distance_fun="euclidean"):
        """Query a data point from the underlying LSHash instance.

        :param query_point: the point to look up, an array or tuple of size
            input_dim.
        :param num_results: (optional) the number of query results to return
            in ranked order. By default all results are returned.
        :param distance_fun: (optional) the distance function used to rank
            the candidate set; euclidean by default. One of:
            "hamming"           - Hamming distance
            "euclidean"         - Euclidean distance
            "true_euclidean"    - true Euclidean distance
            "centred_euclidean" - centred Euclidean distance
            "cosine"            - cosine distance
            "l1norm"            - L1-norm distance
        """
        assert distance_fun in {
            "hamming", "euclidean", "true_euclidean",
            "centred_euclidean", "cosine", "l1norm"
        }
        return self.hash_object.query(query_point=query_point,
                                      num_results=num_results,
                                      distance_func=distance_fun)
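# A minimal usage sketch for the wrapper above (synthetic 8-dimensional
# points, default in-memory dict storage):
lsh_wrapper = LocalSensitiveHash(hash_size=6, input_dim=8)
lsh_wrapper.lsh_index([1, 2, 3, 4, 5, 6, 7, 8], extra_data='point-a')
lsh_wrapper.lsh_index([2, 3, 4, 5, 6, 7, 8, 9], extra_data='point-b')
print(lsh_wrapper.lsh_query([1, 2, 3, 4, 5, 6, 7, 7], num_results=1, distance_fun='cosine'))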
    image = preprocess(image)
    image = np.expand_dims(image, axis=0)
    image_pred_features = model.predict(image)[0]
    lsh.index(image_pred_features.flatten(), extra_data=image_path)

pickle.dump(lsh, open('pick_keras/lsh.p', "wb"))
lsh = pickle.load(open('pick_keras/lsh.p', 'rb'))


def getVgg16Features(image_path):
    image = load_img(image_path)
    image = image.resize(inputShape)
    image = img_to_array(image)
    image = preprocess(image)
    image = np.expand_dims(image, axis=0)
    image_pred_features = model.predict(image)[0]
    return image_pred_features


# search images
input_path = '101_ObjectCategories/car_side/image_0085.jpg'
q_features = getVgg16Features(input_path)
n_items = 5
response = lsh.query(q_features.flatten(), num_results=n_items + 1,
                     distance_func='hamming')
for i in range(len(response)):
    img_path = response[i][0][1]
    print(img_path)
# @Time    : 2017/10/15 21:35
# @Author  : Jalin Hu
# @File    : note.py
# @Software: PyCharm
from lshash.lshash import LSHash

if __name__ == '__main__':
    lsh = LSHash(hash_size=6, input_dim=8)
    lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
    lsh.index([2, 3, 4, 5, 6, 7, 8, 9])
    lsh.index([3, 4, 5, 6, 7, 8, 9, 10])
    lsh.index([10, 12, 99, 1, 5, 6, 24, 20])
    res = lsh.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=2)
    print(res)