Example No. 1
def lsh(p_hash_size, distance_func):
    """
    实现局部敏感哈希模拟KNN的具体函数
    :param p_hash_size: 与vipno的总数(去重后)相乘构成最终的hash_size
    :param distance_funcs: 可选择的距离计算函数
    :return: 去除自身之后的该vipno对应knn的输出vipno
    """
    datas_set, datas_matrix = get_data()
    # vipno_nums 为vipno去重后的总数
    vipno_nums = len(datas_matrix[0])

    # 随机取一个vipno(这里是vipno对应的下标)
    random_vipno = random.randint(0, vipno_nums - 1)

    # 初始化lshash
    lsh = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for i in range(vipno_nums):
        # extra_data为当前列对应的vipno值,作为之后输出的时候所想要的knn的输出vipno
        lsh.index(datas_matrix[:, i], extra_data=datas_set.columns[i])

    vipno_res = []
    # num_results可以限制输出的结果个数,这里取前6个,因为第一个为输入列本身
    for res in lsh.query(datas_matrix[:, random_vipno],
                         num_results=6,
                         distance_func=distance_func):
        vipno_res.append(res[0][1])

    print("distance func:", distance_func)
    print("knn output(from 1 to 5): {}".format(vipno_res[1:]))

    return vipno_res[1:], datas_set.columns[random_vipno]
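
The reads of res[0][1] above rely on the shape of lshash query results: when a point is indexed with extra_data, each hit is a nested tuple ((input_point, extra_data), distance). A minimal sketch of unpacking that structure, reusing the lsh instance and query column from this example:

# Each hit is ((input_point, extra_data), distance); the vipno is the extra_data.
for (point, vipno), distance in lsh.query(datas_matrix[:, random_vipno],
                                          num_results=6,
                                          distance_func=distance_func):
    print("vipno={}, distance={}".format(vipno, distance))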
Example No. 2
def getLSHashOutput(filename, hash_size, k):
    matrix = getMatrix(filename)
    # total_num is the number of columns (one per vipno)
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(matrix.iloc[:, 0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    # pick a random column to query
    out_num = rand.randint(0, total_num - 1)
    # several distance functions are available; the default is 'euclidean'
    print(lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean'))
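
A hedged usage sketch for the function above; the file name and hash-size factor are assumptions, and getMatrix is presumed to return a DataFrame with one column per vipno:

# Hypothetical invocation: print the 5 nearest vipnos for a random column.
getLSHashOutput('trade_records.csv', 0.1, 5)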
Example No. 3
def sphere():
    # 1000 Gaussian points, normalized onto the unit sphere in R^3
    X = np.random.normal(size=(1000, 3))
    lsh = LSHash(10, 3, num_hashtables=5)
    for x in X:
        x /= np.linalg.norm(x)  # normalize each row in place before indexing
        lsh.index(x)
    # query a slightly perturbed copy of the first (already normalized) point
    closest = lsh.query(X[0] + np.array([-0.001, 0.001, -0.001]), distance_func="cosine")
    assert len(closest) >= 10
    assert 0.05 >= closest[9][-1] > 0.0003
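
The assertion bounds above depend on cosine distance, which for unit vectors reduces to 1 - dot(a, b). A small standalone sketch of that identity (the vectors here are made up):

import numpy as np

a = np.array([1.0, 0.0, 0.0])
b = np.array([0.999, 0.001, -0.001])
b /= np.linalg.norm(b)
# For unit vectors, cosine distance is 1 - a.b; a tiny perturbation keeps it near 0.
print(1.0 - np.dot(a, b))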
Example No. 4
def hyperspheres(X=16, num_samples=200000):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    Returns:
      lsh, X, dfs

    >>> import pandas as pd
    >>> lsh, vectors, dfs = hyperspheres(16)
    >>> for df in dfs:
    ...     print(df)
    """
    X = np.random.uniform(size=(num_samples, X)) if isinstance(X, int) else X
    closest = []
    secondclosest = []
    tenthclosest = []
    hundredthclosest = []
    for D in range(2, X.shape[1] + 1):
        lsh = LSHash(int(64 / D) + D, D, num_hashtables=D)

        # query vector
        q = np.random.uniform(size=(D,))
        q /= np.linalg.norm(q)

        distances = []
        for x in X[:, :D]:
            x = x / np.linalg.norm(x)  # copy to avoid re-normalizing X in place across D iterations
            distances += [1. - np.sum(x * q)]  # cosine distance (1 - cosine similarity)
            lsh.index(x)
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')

        N = len(closest10)
        hundredthclosest += [[D, N, closest10[min(99, N - 1)][-1] if N else 2., distances[min(99, N - 1)]]]
        tenthclosest += [[D, N, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]]
        secondclosest += [[D, N, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]]
        closest += [[D, N, closest10[0][-1] if N else 2., distances[0]]]
        print("is correct: 100th 10th 2nd 1st")
        print(round(hundredthclosest[-1][-1], 14) == round(hundredthclosest[-1][-2], 14))
        print(round(tenthclosest[-1][-1], 14) == round(tenthclosest[-1][-2], 14))
        print(round(secondclosest[-1][-1], 14) == round(secondclosest[-1][-2], 14))
        print(round(closest[-1][-1], 14) == round(closest[-1][-2], 14))
        print("distances: 100th 10th 2nd 1st")
        print(hundredthclosest[-1])
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    dfs = []
    for k, (i, df) in enumerate(zip([100, 10, 2, 1], [hundredthclosest, tenthclosest, secondclosest, closest])):
        df = pd.DataFrame(df, columns='D N dist{} true_dist{}'.format(i, i).split()).round(14)
        df['correct{}'.format(i)] = df['dist{}'.format(i)] == df['true_dist{}'.format(i)]
        dfs += [df]
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, dfs
Example No. 5
def b(r, dim, vector):
    # xxx is assumed to be an external iterable of (index, vector) pairs defined elsewhere
    lsh = LSHash(r, dim)
    for n, v in xxx:
        lsh.index(v.tolist())
    # time only the query itself
    start = time.perf_counter()
    q = lsh.query(vector.tolist(), 10, 'cosine')
    end = time.perf_counter()
    # convert cosine distances back to cosine similarities
    qq = [(x, 1 - y) for x, y in q]
    if len(qq) > 0:
        return qq[0][1], end - start
    else:
        return -2, end - start
Example No. 6
def lsh(p_hash_size, distance_funcs):
    """
    实现局部敏感哈希模拟KNN的具体函数
    :param p_hash_size: 与vipno的总数(去重后)相乘构成最终的hash_size
    :param distance_funcs: 可选择的距离计算函数
    :return: 去除自身之后的该vipno对应knn的输出vipno
    """
    datas_set, datas_matrix = get_data()
    # vipno_nums is the total number of distinct vipnos
    vipno_nums = len(datas_matrix[0])

    # pick a random vipno (here, the column index of that vipno)
    random_vipno = random.randint(0, vipno_nums - 1)

    # initialize the LSHash instance
    lsh = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for i in range(vipno_nums):
        # extra_data is the vipno of the current column, so query results can report vipnos directly
        lsh.index(datas_matrix[:, i], extra_data=datas_set.columns[i])

    print("hash size: {}".format(vipno_nums * p_hash_size))
    # print("distance func:", distance_func)
    print("input vipno: {}".format(datas_set.columns[random_vipno]))
    # vipno_res = []

    ends = []
    for distance_func in distance_funcs:
        start = datetime.datetime.now()
        vipno_res = []
        # num_results caps the result count; take the top 6 because the first hit is the query column itself
        for res in lsh.query(datas_matrix[:, random_vipno],
                             num_results=6,
                             distance_func=distance_func):
            vipno_res.append(res[0][1])
        end = (datetime.datetime.now() - start).total_seconds()
        ends.append(end)
        print("distance func:", distance_func)
        print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
        print("time:", end)

    # plot a timing comparison of the distance functions
    plt.bar(distance_funcs,
            ends,
            alpha=0.9,
            width=0.35,
            facecolor='lightskyblue',
            edgecolor='white',
            label='time',
            lw=1)
    plt.legend(loc="upper left")
    plt.show()
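
One design note: datetime.now() is fine for coarse timing, but time.perf_counter() is monotonic and better suited to short benchmarks like these queries. A minimal sketch of the same measurement, assuming the lsh instance and query column from the example above:

import time

start = time.perf_counter()
results = lsh.query(datas_matrix[:, random_vipno],
                    num_results=6,
                    distance_func=distance_func)
elapsed = time.perf_counter() - start  # seconds on a monotonic clock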
Example No. 7
def hyperspheres_10D(X=np.random.uniform(size=(200000, 10))):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    Returns:
      lsh, X, closest, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, rank1, rank2, rank10 = hyperspheres_10D()
    >>> pd.DataFrame(rank2)
    >>> pd.DataFrame(rank10)
    """
    tenthclosest = []
    secondclosest = []
    closest = []
    for D in range(2, X.shape[1]):
        lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D)

        # query vector
        q = np.random.uniform(size=(D, ))
        q /= np.linalg.norm(q)

        distances = []
        for x in X[:, :D]:
            x = x / np.linalg.norm(x)  # copy to avoid re-normalizing X in place across D iterations
            lsh.index(x)
            distances += [1. - np.sum(x * q)]  # cosine distance (1 - cosine similarity)
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')

        N = len(closest10)
        tenthclosest += [[
            D,
            min(9, N - 1) if N else -1,
            closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]
        ]]
        secondclosest += [[
            D,
            min(1, N - 1) if N else -1,
            closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]
        ]]
        closest += [[
            D, 0 if N else -1, closest10[0][-1] if N else 2., distances[0]
        ]]
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, closest, secondclosest, tenthclosest
Example No. 8
def knn(df, k, coefficient):
    hash_size = int(coefficient * df.shape[1])
    lsh = LSHash(hash_size, input_dim=df.shape[0])
    for vipno in df:
        lsh.index(df[vipno], extra_data=vipno)
    random_column = df[df.columns.to_series().sample(1)]
    random_vip = random_column.columns.values[0]
    logging.info('random vipno: {}'.format(random_vip))
    res = lsh.query(random_column.values.flatten())[0:k + 1]
    logging.info('vipno in ranked order using kNN(k = {}):'.format(k))
    knns = []
    for item in res:
        if item[0][1] != random_vip:
            logging.info(item[0][1])
            knns.append(item[0][1])
    return random_vip, knns[:5]
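
A hedged usage sketch for knn; the DataFrame is a placeholder with one column per vipno and one row per plu feature, as the indexing loop implies:

# Hypothetical call: df columns are vipnos, rows are plu features.
random_vip, neighbours = knn(df, k=5, coefficient=0.1)
logging.info('query vip {} -> neighbours {}'.format(random_vip, neighbours))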
Example No. 9
def getLSHashOutput(filename, hash_size, k):
    matrix = getMatrix(filename)
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=len(matrix.iloc[:,0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:,i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # several distance functions are available; the default is 'euclidean'
    m = lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean')
    print("输入的vipno是" + str(matrix.columns[out_num]) + "\n其桶中的vipno有:")
    bucket = []
    for i in range(len(m)):
        print(m[i][0][1])
        tag = np.argwhere(matrix.columns == m[i][0][1])
        bucket.append(int(tag))
    return bucket
Example No. 10
def bagof_word2vec(vocablist, inputset):
    # bag-of-words vectorization: count how often each vocabulary word occurs
    # (the function head was missing from the snippet; reconstructed from the call below)
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] += 1
        else:
            print('word:', word, 'is not in the list_vec')
    return returnvec


if __name__ == '__main__':
    datalist, classlist, vocabset = textprocess('./paper')  # get the word set of every paper
    stop_word_file = './stopwords_cn.txt'
    stop_word_set = make_word_set(stop_word_file)
    feature_words = word_dict(vocabset, 0, stop_word_set)
    trainMat = []

    lsh = LSHash(hash_size=10, input_dim=len(feature_words))
    for postinDoc in datalist:
        trainMat_vec = bagof_word2vec(feature_words, postinDoc)  # vectorize the training set
        trainMat.append(trainMat_vec)
        lsh.index(trainMat_vec)

    testfile = './test.txt'
    testlist = []
    with open(testfile, 'r', encoding='utf-8') as f:
        sequence = f.read()
        testlist.append(jieba.lcut(sequence, cut_all=False))
        testvect = bagof_word2vec(feature_words, testlist[0])

    re = lsh.query(testvect, num_results=1)
    print(list(re[0][0]))
    print(trainMat.index(list(re[0][0])))
    print('The most similar paper is:', classlist[trainMat.index(list(re[0][0]))])
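
The lookup trainMat.index(list(re[0][0])) recovers the paper by value, which is linear in the training set and breaks if two papers share a vector. Since lshash supports extra_data (as several other examples here use), a sketch that attaches the label at indexing time instead:

# Sketch: carry the paper label in extra_data so the query result contains it directly.
lsh = LSHash(hash_size=10, input_dim=len(feature_words))
for vec, label in zip(trainMat, classlist):
    lsh.index(vec, extra_data=label)

re = lsh.query(testvect, num_results=1)
print('The most similar paper is:', re[0][0][1])  # hit is ((point, extra_data), distance)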
Example No. 11
count = np.zeros(DBconnection.DBconnection.count(a.dbconnect_to_collection()))

index2 = tfidf.indptr[DBconnection.DBconnection.count(a.dbconnect_to_collection())]
for i in range(DBconnection.DBconnection.count(a.dbconnect_to_collection()),
               DBconnection.DBconnection.count(b.dbconnect_to_collection()) - 1):
    # take at most the first 8 tf-idf values of row i
    # (renamed from `b`, which shadowed the connection object used in the range above)
    vec = []
    j = 0
    while j < (tfidf.indptr[i + 1] - tfidf.indptr[i]):
        if j > 7:
            break
        vec.append(round(tfidf.data[index2 + j], 2))
        j += 1
    # pad the vector to 8 dimensions
    if len(vec) < 8:
        for index in range(8 - len(vec)):
            vec.append(1)
    final_result = lsh.query(vec)
    # retry up to three times if the query comes back empty
    retry = 0
    while not final_result and retry < 3:
        final_result = lsh.query(vec)
        retry += 1

    if not final_result:
        count[0] += 1
        print(assinglist[i])

    else:
        checklist = []
        for elem in final_result[0][0]:
            checklist.append(elem)
        count[centriodSet.index(checklist)] += 1
Example No. 12
class ImageSearchEngine(object):
    """A simple image search engine based on ORB, Kmeans and LSHash."""

    def __init__(self):
        self._all_feats = []
        self._img_dict = {}
        self._kmeans = None
        self._lsh = None

    def load_images(self, img_list: list) -> int:
        """Load images, extract features using ORB for indexing.

        Args:
            img_list: list of image files' names.

        Returns:
            count of image files successfully loaded.
        """
        count = 0
        progress_bar = tqdm(total=len(img_list))
        for img_name in img_list:
            try:
                img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
                _, features = orb.detectAndCompute(img, None)
                # Record index of features for this image
                start_index, num_feats = len(self._all_feats), len(features)
                self._img_dict[img_name] = {'start_index': start_index,
                                            'num_feats': num_feats}
                # Append the new features
                self._all_feats.extend(features)
                count += 1
            except Exception as e:
                logger.warning(e)
                logger.warning('Error processing {}'.format(img_name))
            progress_bar.update(1)
        progress_bar.close()
        logger.info('Successfully loaded {} images, extracted {} features.'
                    .format(count, len(self._all_feats)))
        return count

    def build_index(self, k: int, hash_size: int = 10, num_hashtables: int = 1,
                    store_file: str = None, overwrite: bool = False):
        """Build index for each picture.
        First, run K-means on the previously extracted features to find k key features and the cluster assignment of each feature.

        Then build a histogram over each image's feature assignments; the resulting distribution serves as a unique fingerprint for the image.

        Finally, use LSHash (locality-sensitive hashing) to index each image by its histogram array.

        Args:
            k: parameter used in K-means, the number of centroids (key features).
            hash_size: length of the resulting binary hash array.
            num_hashtables: number of hash tables for multiple lookups.
            store_file: path to the .npz file where the random matrices are stored, or are to be stored if the file does not exist yet.
            overwrite: whether to overwrite the matrices file if it already exists.

        """
        assert 0 < k < len(self._all_feats)
        assert hash_size > 0 and num_hashtables > 0

        # Use k-means to calculate k key features and the assignment of each feature.
        logger.info('Calculating {} key features...'.format(k))
        # Mini-batch k-means handles large amounts of data better.
        self._kmeans = MiniBatchKMeans(n_clusters=k)
        self._kmeans.fit(np.array(self._all_feats))
        idx = self._kmeans.labels_
        logger.info('Start indexing each image.')

        # Calculate histogram of each image
        self._lsh = LSHash(hash_size=hash_size,
                           input_dim=k,
                           num_hashtables=num_hashtables,
                           matrices_filename=store_file,
                           overwrite=overwrite)
        success = 0
        progress_bar = tqdm(total=len(self._img_dict))
        bins = np.arange(-0.5, k + 0.5, 1)
        for img_name, img_meta in self._img_dict.items():
            try:
                start = img_meta['start_index']
                end = start + img_meta['num_feats']
                # Perform histogram
                hist, _ = np.histogram(idx[start:end], bins=bins)
                img_meta['histogram'] = hist
                # Store each picture in hash tables
                self._lsh.index(input_point=hist, extra_data=img_name)
                success += 1
            except Exception as e:
                logger.warning(e)
                logger.warning('Error when indexing image: {}'.format(img_name))
            progress_bar.update(1)
        progress_bar.close()
        logger.info('Successfully indexed {} images.'.format(success))

    def search(self, img_name: str, num_results: int = None,
               distance_func: str = None) -> list:
        """Search image.

        Args:
            img_name: name of the image file to be searched.
            num_results: the number of query results to return, in ranked order. By default all results will be returned.
            distance_func: the distance function to be used, one of ("hamming", "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm").
                By default "euclidean" will be used.

        Returns:
            list of names of matching images.
        """
        assert self._lsh is not None and self._kmeans is not None
        res = []
        try:
            img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
            _, features = orb.detectAndCompute(img, None)
            idx = self._kmeans.predict(features)
            bins = np.arange(-0.5, len(self._kmeans.cluster_centers_) + 0.5, 1)
            hist, _ = np.histogram(idx, bins=bins)
            res = self._lsh.query(hist, num_results=num_results,
                                  distance_func=distance_func)
        except Exception as e:
            logger.warning(e)
        return res

    def dump(self, pkl_file: str = 'model.pkl'):
        with open(pkl_file, 'wb') as f:
            pickle.dump(self, f)

    @property
    def num_images(self) -> int:
        return len(self._img_dict)
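
A hedged end-to-end sketch of the class above; the image paths and parameter values are placeholders:

# Hypothetical usage; paths and parameters are made up.
engine = ImageSearchEngine()
engine.load_images(['imgs/a.jpg', 'imgs/b.jpg', 'imgs/c.jpg'])  # extract ORB features
engine.build_index(k=256, hash_size=10, num_hashtables=4)       # k-means + LSH index
for (hist, name), dist in engine.search('imgs/query.jpg', num_results=5):
    print(name, dist)
engine.dump('model.pkl')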
Example No. 13
def test_lshash():
    lsh = LSHash(6, 8)  # create a 6-bit hash for 8-dimensional input data
    lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
    lsh.index([2, 3, 4, 5, 6, 7, 8, 9])
    lsh.index([10, 12, 99, 1, 5, 31, 2, 3])
    print(lsh.query([1, 2, 3, 4, 5, 6, 7, 7]))
Example No. 14

o = open('lsh_output.txt', 'w')  # create a file to write the results

# loop over different hash sizes
for e in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
    lsh = LSHash(round(n_vip * e), n_plu)
    for v in vipno:
        feature = list(trade_mat[v])
        lsh.index(feature, extra_data=v)

    # pick a random vipno (valid indices run from 0 to n_vip - 1)
    pick_vip = vipno[random.randint(0, n_vip - 1)]
    o.write("Hash_size = {} * n_vip\n".format(e))
    o.write("Pick up a vip: {}\n".format(pick_vip))

    # lsh query and write the results
    candi = lsh.query(list(trade_mat[pick_vip]), 6, distance_func='hamming')
    for i, item in enumerate(candi[1:]):
        dist = item[1]
        feature = list(item[0][0])
        v = item[0][1]
        o.write("Top {0} : vipno = {1}, distance = {2}\n".format(
            i + 1, v, dist))
    o.write("\n")

o.close()
print("The lshash results have been saved in file 'lsh_output.txt'.")
Example No. 15
class LocalSensitiveHash(object):
    def __init__(self,
                 hash_size,
                 input_dim,
                 num_of_hashtables=1,
                 storage=None,
                 matrices_filename=None,
                 overwrite=False):
        """
        Attributes:
        :param hash_size:
            The length of the resulting binary hash in integer.E.g., 32 means the resulting binary hash will be 32 - bit long.

        :param input_dim:
            The dimension of the input vector.E.g., a grey - scale picture of 30x30 pixels will have an input dimension of 900.

        :param num_hashtables:
            (optional) The number of hash tables used for multiple lookups.

        :param storage_config:
            (optional) A dictionary of the form `{backend_name: config}` where `backend_name` is the either `dict` or `redis`,
            and `config` is the configuration used by the backend.
            For `redis`it should be in the format of`{"redis": {"host": hostname, "port": port_num}}`,
            where `hostname` is normally `localhost` and `port` is normally 6379.

        :param matrices_filename:
            (optional) Specify the path to the compressed numpy file endin with extension `.npz`, where the uniform random planes
            are stored, or to be stored if the file does not exist yet.

        :paramoverwrite:
            (optional) Whether to overwrite the matrices file if it already exist
        """
        self.hash_object = LSHash(
            hash_size=hash_size,  # length of the resulting binary hash
            input_dim=input_dim,  # dimension of the input vector
            # lshash's keyword is num_hashtables; the wrapper exposes it as num_of_hashtables
            num_hashtables=num_of_hashtables,  # (optional) number of hash tables for multiple lookups
            # lshash's keyword is storage_config
            storage_config=storage,  # (optional) storage backend config, e.g. {"redis": {...}}
            matrices_filename=matrices_filename,  # (optional) path of the .npz file holding the random planes
            overwrite=overwrite)  # (optional) whether to overwrite an existing matrices file

    # Index a data point with the given LSHash instance
    def lsh_index(self, input_point, extra_data=None):
        """
        :param input_point: an array or tuple of size input_dim
        :param extra_data: (optional) extra data to be stored together with input_point
        :return:
        """
        self.hash_object.index(input_point=input_point, extra_data=extra_data)

    # Query a data point against the given LSHash instance
    def lsh_query(self,
                  query_point,
                  num_results=None,
                  distance_fun="euclidean"):
        """
        :param query_point: the point to query, an array or tuple of size input_dim
        :param num_results: (optional) the number of ranked query results to return; by default all results are returned
        :param distance_fun: (optional) the distance function used to rank the candidate set; euclidean by default.
            Available distance functions:
            ("hamming",            Hamming distance
             "euclidean",          Euclidean distance
             "true_euclidean",     true Euclidean distance
             "centred_euclidean",  centred Euclidean distance
             "cosine",             cosine distance
             "l1norm")             L1 norm
        :return:
        """
        assert distance_fun in {
            "hamming", "euclidean", "true_euclidean", "centred_euclidean",
            "cosine", "l1norm"
        }
        return self.hash_object.query(query_point=query_point,
                                      num_results=num_results,
                                      distance_func=distance_fun)
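
A short usage sketch for the wrapper above; the vectors and dimensions are made up:

# Hypothetical usage of the wrapper with an 8-dimensional input and a 6-bit hash.
wrapper = LocalSensitiveHash(hash_size=6, input_dim=8)
wrapper.lsh_index([1, 2, 3, 4, 5, 6, 7, 8], extra_data='vec-1')
print(wrapper.lsh_query([1, 2, 3, 4, 5, 6, 7, 7], num_results=1, distance_fun='euclidean'))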
Example No. 16
    # (snippet begins mid-loop: image_path iterates over the indexed images, and each
    # image was loaded, resized, and converted to an array above, as in getVgg16Features below)
    image = preprocess(image)
    image = np.expand_dims(image, axis=0)
    image_pred_features = model.predict(image)[0]
    lsh.index(image_pred_features.flatten(), extra_data=image_path)

pickle.dump(lsh, open('pick_keras/lsh.p', "wb"))

lsh = pickle.load(open('pick_keras/lsh.p', 'rb'))


def getVgg16Features(image_path):
    image = load_img(image_path)
    image = image.resize(inputShape)
    image = img_to_array(image)
    image = preprocess(image)
    image = np.expand_dims(image, axis=0)
    image_pred_features = model.predict(image)[0]
    return image_pred_features


# search images
input_path = '101_ObjectCategories/car_side/image_0085.jpg'
q_features = getVgg16Features(input_path)
n_items = 5
response = lsh.query(q_features.flatten(),
                     num_results=n_items + 1,
                     distance_func='hamming')

for i in range(len(response)):
    img_path = response[i][0][1]
    print(img_path)
Example No. 17
# @Time    : 2017/10/15 21:35
# @Author  : Jalin Hu
# @File    : note.py
# @Software: PyCharm
from lshash.lshash import LSHash
if __name__ == '__main__':
    lsh = LSHash(hash_size=6, input_dim=8)
    lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
    lsh.index([2, 3, 4, 5, 6, 7, 8, 9])
    lsh.index([3, 4, 5, 6, 7, 8, 9, 10])
    lsh.index([10, 12, 99, 1, 5, 6, 24, 20])
    res = lsh.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=2)
    print(res)