Python LeanMinHash 예제들, datasketch.LeanMinHash Python 예제들

예제 #1

0

파일 보기

 def test_init(self):
     """LeanMinHash objects built from identically configured MinHash
     objects are expected to expose equal hash values and equal seeds."""
     sources = [MinHash(4, 1, hashfunc=fake_hash_func) for _ in range(2)]
     first, second = [LeanMinHash(src) for src in sources]
     same_values = np.array_equal(first.hashvalues, second.hashvalues)
     self.assertTrue(same_values)
     same_seed = np.array_equal(first.seed, second.seed)
     self.assertTrue(same_seed)

예제 #2

0

파일 보기

파일: main.py 프로젝트: gracefan2020/findopendata

def joinable_column_search():
    """Return the columns most joinable with the query column, as JSON.

    Request arguments read from ``request.args``:
        id: UUID of the query column; an empty JSON list is returned when it
            is missing or cannot be parsed as a UUID.
        limit: maximum number of results (default 50).
        original_host: repeatable filter forwarded to the sketch lookup.

    The query column's sketch is loaded from the database, candidate column
    ids are fetched from the LSH server, and each candidate is scored by
    Jaccard similarity and containment.  The ``limit`` candidates with the
    highest containment are returned, best first.

    Aborts with 404 when the query column does not exist and with 500 when
    the LSH server request fails.
    """
    query_id = request.args.get('id', None, type=uuid.UUID)
    if query_id is None:
        return jsonify([])
    limit = request.args.get('limit', default=50, type=int)
    original_host_filter = tuple(request.args.getlist('original_host'))
    cnx = cnxpool.getconn()
    # Every exit path below (return, abort, unexpected exception) must hand
    # the connection back to the pool, hence the single try/finally.
    try:
        # Obtain the MinHash of the query.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(cursor, (query_id,))
            query = cursor.fetchone()
        if query is None:
            # The query does not exist.
            abort(404)
        # Query the LSH Server.
        try:
            resp = requests.post(lshserver_endpoint+"/query",
                    json={"seed": query["seed"], "minhash": query["minhash"]})
            resp.raise_for_status()
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection failures and timeouts,
            # not only the HTTPError raised by raise_for_status().
            app.logger.error("Error in querying the LSH server: {}".format(err))
            abort(500)
        column_ids = [column_id for column_id in resp.json()
                if column_id != str(query_id)]
        if not column_ids:
            # Return empty result.
            return jsonify([])
        # Create the final query results.
        results = []
        query_minhash = LeanMinHash(seed=query["seed"],
                hashvalues=query["minhash"])
        # Obtain the column sketches of the results.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(cursor, tuple(column_ids),
                    original_hosts=original_host_filter)
            for column in cursor:
                # Skip columns from query table.
                if column["package_file_id"] == query["package_file_id"]:
                    continue
                # Compute the similarities for each column in the result.
                jaccard = query_minhash.jaccard(LeanMinHash(
                        seed=column["seed"], hashvalues=column["minhash"]))
                containment = _containment(jaccard, column["distinct_count"],
                        query["distinct_count"])
                # The raw sketch is not part of the response payload.
                column.pop("seed")
                column.pop("minhash")
                column["jaccard"] = jaccard
                column["containment"] = containment
                entry = (containment, column["id"], dict(column))
                # Min-heap bounded to `limit` entries: once full, pushing and
                # popping in one step evicts the lowest containment.
                if len(results) < limit:
                    heapq.heappush(results, entry)
                else:
                    heapq.heappushpop(results, entry)
    finally:
        # Done with SQL.
        cnxpool.putconn(cnx)
    results = [column for _, _, column in heapq.nlargest(limit, results)]
    return jsonify(results)

예제 #3

0

파일 보기

파일: lean_minhash_test.py 프로젝트: zjiaksmc/datasketch

 def test_init(self):
     """LeanMinHash objects built from two MinHash objects with the same
     configuration are expected to share hash values and seed."""
     left_source = MinHash(4, 1, hashobj=FakeHash)
     right_source = MinHash(4, 1, hashobj=FakeHash)
     left, right = LeanMinHash(left_source), LeanMinHash(right_source)
     values_match = np.array_equal(left.hashvalues, right.hashvalues)
     self.assertTrue(values_match)
     seeds_match = np.array_equal(left.seed, right.seed)
     self.assertTrue(seeds_match)

예제 #4

0

파일 보기

 def test_union(self):
     """The union of an un-updated sketch and an updated sketch is expected
     to have Jaccard similarity 1.0 with the updated sketch."""
     untouched = MinHash(4, 1, hashfunc=fake_hash_func)
     updated = MinHash(4, 1, hashfunc=fake_hash_func)
     updated.update(12)
     lean_untouched = LeanMinHash(untouched)
     lean_updated = LeanMinHash(updated)
     merged = LeanMinHash.union(lean_untouched, lean_updated)
     similarity = merged.jaccard(lean_updated)
     self.assertTrue(similarity == 1.0)

예제 #5

0

파일 보기

 def test_update(self):
     """Calling ``update`` on a LeanMinHash must raise TypeError.

     The LeanMinHash is constructed outside the ``assertRaises`` block so
     that a TypeError raised by the constructor cannot make the test pass
     by accident.
     """
     m1 = MinHash(4, 1, hashfunc=fake_hash_func)
     lm1 = LeanMinHash(m1)
     with self.assertRaises(TypeError):
         lm1.update(12)

예제 #6

0

파일 보기

파일: lean_minhash_test.py 프로젝트: zjiaksmc/datasketch

 def test_update(self):
     """Calling ``update`` on a LeanMinHash must raise TypeError.

     The LeanMinHash is constructed outside the ``assertRaises`` block so
     that a TypeError raised by the constructor cannot make the test pass
     by accident.
     """
     m1 = MinHash(4, 1, hashobj=FakeHash)
     lm1 = LeanMinHash(m1)
     with self.assertRaises(TypeError):
         lm1.update(12)

예제 #7

0

파일 보기

파일: lean_minhash_test.py 프로젝트: zjiaksmc/datasketch

 def test_count(self):
     """``count()`` on a LeanMinHash built from an updated MinHash is
     expected to return a non-negative value."""
     source = MinHash(hashobj=FakeHash)
     for value in (11, 123, 92, 98, 123218, 32):
         source.update(value)
     estimate = LeanMinHash(source).count()
     self.assertGreaterEqual(estimate, 0)

예제 #8

0

파일 보기

 def test_count(self):
     """``count()`` on a LeanMinHash built from an updated MinHash is
     expected to return a non-negative value."""
     source = MinHash(hashfunc=fake_hash_func)
     for value in (11, 123, 92, 98, 123218, 32):
         source.update(value)
     estimate = LeanMinHash(source).count()
     self.assertGreaterEqual(estimate, 0)

예제 #9

0

파일 보기

파일: lean_minhash_test.py 프로젝트: zjiaksmc/datasketch

 def test_jaccard(self):
     """Check the Jaccard estimate between LeanMinHash snapshots taken
     before and after the underlying MinHash objects are updated."""
     source_a = MinHash(4, 1, hashobj=FakeHash)
     source_b = MinHash(4, 1, hashobj=FakeHash)
     lean_a = LeanMinHash(source_a)
     lean_b = LeanMinHash(source_b)
     # Neither source has been updated yet.
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity == 1.0)

     # Only the second source receives a value; re-snapshot it.
     source_b.update(12)
     lean_b = LeanMinHash(source_b)
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity == 0.0)

     # The first source receives a different value; re-snapshot it.
     source_a.update(13)
     lean_a = LeanMinHash(source_a)
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity < 1.0)

예제 #10

0

파일 보기

 def test_jaccard(self):
     """Check the Jaccard estimate between LeanMinHash snapshots taken
     before and after the underlying MinHash objects are updated."""
     source_a = MinHash(4, 1, hashfunc=fake_hash_func)
     source_b = MinHash(4, 1, hashfunc=fake_hash_func)
     lean_a = LeanMinHash(source_a)
     lean_b = LeanMinHash(source_b)
     # Neither source has been updated yet.
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity == 1.0)

     # Only the second source receives a value; re-snapshot it.
     source_b.update(12)
     lean_b = LeanMinHash(source_b)
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity == 0.0)

     # The first source receives a different value; re-snapshot it.
     source_a.update(13)
     lean_a = LeanMinHash(source_a)
     similarity = lean_a.jaccard(lean_b)
     self.assertTrue(similarity < 1.0)

예제 #11

0

파일 보기

    def _train_LSH(self):
        """Build a MinHashLSH index over the character bigrams of each domain.

        For every entry of ``self.new_domains`` the registrable domain part
        is extracted with ``get_tld``, turned into a set of bigrams, hashed
        into a 128-permutation MinHash and inserted (as a LeanMinHash) under
        the full domain string.  Entries whose domain part cannot be
        extracted or is at most 3 characters long are skipped.

        Returns:
            The populated MinHashLSH instance.
        """
        # create LSH model
        lsh = MinHashLSH(num_perm=128, params=(5, 7))

        # train LSH model
        for dom in self.new_domains:

            # remove TLD
            tld_info = get_tld('http://' + dom,
                               as_object=True,
                               fail_silently=True)

            try:
                d = tld_info.domain
            except AttributeError:
                # With fail_silently=True the lookup result may be None
                # instead of a result object -- nothing to index then.
                continue

            # ignore super short new domains
            if len(d) <= 3:
                continue

            # create bigram set
            bigrams = {d[i:i + 2] for i in range(len(d) - 1)}
            minhash = MinHash(num_perm=128)
            for b in bigrams:
                minhash.update(b.encode('utf-8'))

            minhash_lean = LeanMinHash(minhash)

            lsh.insert(dom, minhash_lean)

        print('LSH Trained!')
        return lsh

예제 #12

0

파일 보기

파일: hash_utils.py 프로젝트: organisciak/HT-MinHash

def make_hash(htid, wordset, **kwargs):
    """Build a LeanMinHash over ``wordset`` and return it serialized.

    Each element of ``wordset`` is converted with ``str()`` and UTF-8
    encoded before being fed to a ``MinHash`` created with ``**kwargs``.
    The result is whatever ``serialize_lm(htid, lean_minhash)`` returns.
    """
    sketch = MinHash(**kwargs)
    encoded_ids = (str(wordid).encode('utf-8') for wordid in wordset)
    for token in encoded_ids:
        sketch.update(token)
    return serialize_lm(htid, LeanMinHash(sketch))

예제 #13

0

파일 보기

def make_hash(string, nperm=N_PERM):
    """Return a LeanMinHash for ``string``, given as an iterable of words.

    The MinHash is created with the ``md5`` hash object and the shared,
    module-level ``global_perm`` permutations; every word is UTF-8 encoded
    before it is added.
    """
    sketch = MinHash(num_perm=nperm, permutations=global_perm, hashobj=md5)
    encoded_words = (token.encode(encoding='utf-8') for token in string)
    for payload in encoded_words:
        sketch.update(payload)
    lean = LeanMinHash(sketch)
    return lean

예제 #14

0

파일 보기

 def test_hash(self):
     """Snapshots of the same MinHash state are expected to hash equally
     and be interchangeable as dict keys; a snapshot taken after a further
     update is expected to hash differently."""
     source = MinHash(hashfunc=fake_hash_func)
     for value in (11, 123, 92, 98, 123218, 32):
         source.update(value)
     snapshot_a = LeanMinHash(source)
     snapshot_b = LeanMinHash(source)
     self.assertEqual(hash(snapshot_a), hash(snapshot_b))

     source.update(444)
     snapshot_c = LeanMinHash(source)
     self.assertNotEqual(hash(snapshot_a), hash(snapshot_c))

     # Look the entry up through the other, equal snapshot.
     lookup = {snapshot_a: True}
     self.assertTrue(lookup[snapshot_b])

예제 #15

0

파일 보기

    def test_eq(self):
        """Exercise LeanMinHash equality against differing content, differing
        second positional argument, differing first positional argument, and
        identical construction."""
        m1 = MinHash(4, 1, hashfunc=fake_hash_func)
        m2 = MinHash(4, 1, hashfunc=fake_hash_func)
        m3 = MinHash(4, 2, hashfunc=fake_hash_func)
        m4 = MinHash(8, 1, hashfunc=fake_hash_func)
        m5 = MinHash(4, 1, hashfunc=fake_hash_func)
        for sketch, value in ((m1, 11), (m2, 12), (m3, 11), (m4, 11), (m5, 11)):
            sketch.update(value)

        lm1, lm2, lm3, lm4, lm5 = [
            LeanMinHash(sketch) for sketch in (m1, m2, m3, m4, m5)
        ]
        for different in (lm2, lm3, lm4):
            self.assertNotEqual(lm1, different)
        self.assertEqual(lm1, lm5)

        # After the cross-updates both sources have received 11 and 12.
        m1.update(12)
        m2.update(11)
        lm1, lm2 = LeanMinHash(m1), LeanMinHash(m2)
        self.assertEqual(lm1, lm2)

예제 #16

0

파일 보기

    def test_pickle(self):
        """A LeanMinHash round-tripped through pickle is expected to keep its
        seed and hash values."""
        source = MinHash(4, 1, hashfunc=fake_hash_func)
        for value in (123, 45):
            source.update(value)
        original = LeanMinHash(source)

        payload = pickle.dumps(original)
        restored = pickle.loads(payload)
        self.assertEqual(restored.seed, original.seed)
        values_match = np.array_equal(restored.hashvalues, original.hashvalues)
        self.assertTrue(values_match)

예제 #17

0

파일 보기

def deserialize_minhash(column):
    """
    Load the serialized LeanMinHash stored on disk for the given column.

    The file lives under ``$WORKING_DIRECTORY/results/minhashes`` and is
    named ``<table>.<column>.txt``.  If it does not exist yet,
    ``serialize_min_hash`` is called for this column first.
    @param column: mapping with at least the keys "table" and "column"
    @return: the LeanMinHash deserialized from the file contents
    """
    working_dir = os.environ["WORKING_DIRECTORY"]
    file_path = f'{working_dir}/results/minhashes/{column["table"]}.{column["column"]}.txt'
    missing = not os.path.isfile(file_path)
    if missing:
        serialize_min_hash([column])
    with open(file_path, 'rb') as file:
        raw = file.read()
    return LeanMinHash.deserialize(bytearray(raw))

예제 #18

0

파일 보기

파일: duplicates_remover.py 프로젝트: pavel-razgovorov/herramientas-analisis

    def _create_minhash_from_file(self, current_dir_path, filename):
        """Index one article file by the MinHash of its body words.

        The file is parsed as JSON into an ``Article``.  If the article has
        no body, the file is deleted from disk and nothing is indexed.
        Otherwise the body is split on single spaces, each word is UTF-8
        encoded into a MinHash, and the resulting LeanMinHash is stored in
        ``self.minhashes`` and inserted into ``self.lsh``, both keyed by the
        file path.

        Args:
            current_dir_path: directory containing the file.
            filename: name of the JSON file inside that directory.
        """
        file_path = f'{current_dir_path}/{filename}'
        # Only the JSON parsing needs the file handle; closing it first
        # lets os.remove succeed on platforms that refuse to delete open
        # files, and avoids holding the handle during hashing.
        with open(file_path) as f:
            article = Article(**json.load(f))

        if not article.body:
            os.remove(file_path)
            return

        minhash = MinHash()
        for word in article.body.split(' '):
            minhash.update(word.encode('utf8'))
        lean_minhash = LeanMinHash(minhash)
        self.minhashes[file_path] = lean_minhash
        self.lsh.insert(file_path, lean_minhash)

예제 #19

0

파일 보기

파일: find_duplicates_lsh.py 프로젝트: ml-research/MoRT_NMI

def make_minhash_mapping(item, shingles: int, num_perm: int):
    """Build the LeanMinHash of a document's distinct shingles.

    Args:
        item: ``(doc_id, doc)`` pair.
        shingles: n-gram size passed to ``ngrams``.
        num_perm: number of permutations for the MinHash.

    Returns:
        ``(doc_id, lean_minhash, number_of_distinct_shingles)``.
    """
    doc_id, doc = item

    # Distinct shingles only; each one is joined into a string and
    # UTF-8 encoded before hashing.
    shingles_set = set(ngrams(doc, shingles))
    sketch = MinHash(num_perm=num_perm)
    for gram in shingles_set:
        sketch.update(''.join(gram).encode('utf8'))

    return doc_id, LeanMinHash(sketch), len(shingles_set)

예제 #20

0

파일 보기

def serialize_min_hash(columns, override=False):
    """
    Compute a LeanMinHash for each column and write it to a local file.

    Output goes to ``$WORKING_DIRECTORY/results/minhashes/<table>.<column>.txt``.
    A column whose file already exists is skipped unless ``override`` is set.
    @param override: when true, existing files are regenerated
    @param columns: iterable of mappings with the keys "table" and "column"
    @return: None
    """
    for column in columns:
        output_file = f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/{column["table"]}.{column["column"]}.txt'
        already_written = os.path.isfile(output_file)
        if already_written and not override:
            continue

        # Distinct column values -> tokens -> MinHash.
        distinct_values = queryDatabase.get_distnct_column_values(column['table'], column)
        sketch = MinHash(num_perm=NUM_PERM)
        for token in tokenize(distinct_values):
            sketch.update(token.encode('utf8'))

        # Serialize into a buffer sized exactly for this sketch.
        lean = LeanMinHash(sketch)
        payload = bytearray(lean.bytesize())
        lean.serialize(payload)
        with open(output_file, 'wb') as file:
            file.write(payload)
            print(f'Serialization is complete for {column["table"]}.{column["column"]}.')

예제 #21

0

파일 보기

파일: generate_minhashes.py 프로젝트: yyht/openwebtext2

def process_file(file_path, tqdm_func, global_tqdm):
    """Compute one LeanMinHash per document of a JSONL file.

    Each document is reduced to its set of distinct 5-grams (via
    ``extract_ngrams``), which are UTF-8 encoded into a 10-permutation
    MinHash.  After every document ``global_tqdm`` is advanced by the
    change in ``reader.fh.tell()``.  ``tqdm_func`` is accepted but not used
    here.

    Returns:
        ``(file_path, list_of_lean_minhashes)``.
    """
    reader = Reader()
    minhashes = []
    last_position = 0
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        sketch = MinHash(num_perm=10)
        for gram in set(extract_ngrams(document, 5)):
            sketch.update(gram.encode('utf8'))
        minhashes.append(LeanMinHash(sketch))

        # Update Progress Bar
        position = reader.fh.tell()
        global_tqdm.update(position - last_position)
        last_position = position

    return file_path, minhashes

예제 #22

0

파일 보기

    def test_deserialize(self):
        """Serializing a LeanMinHash and deserializing the buffer is expected
        to give back the same seed and the same hash values."""
        source = MinHash(10, 1, hashfunc=fake_hash_func)
        source.update(123)
        original = LeanMinHash(source)
        buf = bytearray(original.bytesize())
        original.serialize(buf)

        # Round-trip through the byte buffer and compare field by field.
        restored = LeanMinHash.deserialize(buf)
        self.assertEqual(restored.seed, original.seed)
        self.assertEqual(len(restored.hashvalues), len(original.hashvalues))
        value_pairs = zip(original.hashvalues, restored.hashvalues)
        self.assertTrue(all(got == expected for expected, got in value_pairs))

예제 #23

0

파일 보기

def _load_lib_profile(profile_path_n_mode_n_repackage):
    """Build per-class LeanMinHash entries for one library profile.

    Args:
        profile_path_n_mode_n_repackage: ``(profile_path, mode, repackage)``
            tuple (packed into one argument).

    Returns:
        ``(lib_name_version, minhash_list, relationship_graphs)`` where
        ``minhash_list`` holds ``(key, lean_minhash, signature_count)``
        tuples for every class with signatures.  The list is empty, and the
        graphs ``None``, when the library has fewer than
        ``SHRINK_MINIMUM_NUMBER`` class names.  Graphs are only computed in
        ``MODE.ACCURATE``.
    """
    profile_path, mode, repackage = profile_path_n_mode_n_repackage

    analyzer = LibAnalyzer(profile_path)
    lib_name_version = "{}_{}".format(analyzer.lib_name, analyzer.lib_version)

    entries = []
    graphs = None

    # Guard clause: libraries below the size threshold yield no entries.
    if len(analyzer.classes_names) < SHRINK_MINIMUM_NUMBER:
        return (lib_name_version, entries, graphs)

    if mode == MODE.ACCURATE:
        graphs = analyzer.get_relationship_graphs(repackage)

    classes_signatures = analyzer.get_classes_signatures()

    # First pass: library-wide totals that are embedded in every key.
    all_signatures = set()
    classes_with_signatures = 0
    for name in classes_signatures:
        if classes_signatures[name]:
            all_signatures.update(classes_signatures[name])
            classes_with_signatures += 1

    # Second pass: one LeanMinHash per class that has signatures.
    for name in classes_signatures:
        own_signatures = classes_signatures[name]
        if not own_signatures:
            continue

        sketch = MinHash(num_perm=LSH_PERM_NUM)
        for signature in own_signatures:
            sketch.update(signature.encode('utf8'))

        lean = LeanMinHash(sketch)
        key = "{}|{}|{}|{}|{}|->{}".format(lib_name_version,
                                           analyzer.root_package,
                                           classes_with_signatures,
                                           len(all_signatures),
                                           analyzer.category,
                                           name)
        entries.append((key, lean, len(own_signatures)))

    return (lib_name_version, entries, graphs)

예제 #24

0

파일 보기

파일: lean_minhash_test.py 프로젝트: zjiaksmc/datasketch

    def test_deserialize_byteorder(self):
        """For each byte-order character in "@=<>!", a serialize/deserialize
        round trip is expected to preserve the seed and the hash values."""
        for byteorder in "@=<>!":
            source = MinHash(10, 1, hashobj=FakeHash)
            source.update(123)
            original = LeanMinHash(source)
            buf = bytearray(original.bytesize(byteorder))
            original.serialize(buf, byteorder)

            # Round-trip through the byte buffer and compare field by field.
            restored = LeanMinHash.deserialize(buf, byteorder)
            restored.hashobj = FakeHash
            self.assertEqual(restored.seed, original.seed)
            self.assertEqual(len(restored.hashvalues),
                             len(original.hashvalues))
            value_pairs = zip(original.hashvalues, restored.hashvalues)
            self.assertTrue(
                all(got == expected for expected, got in value_pairs))

예제 #25

0

파일 보기

파일: scraper.py 프로젝트: MikeKahn/spacetime-crawler4py

def tokenize_words(url, text):
    """Tokenize page text and build a LeanMinHash of the kept tokens.

    The text is lower-cased and lightly whitespace-normalized, split with
    ``word_tokenize``, and each token is kept only if it is not a stopword,
    starts with an alphanumeric character, and is non-empty after all
    non-alphanumeric characters are stripped.

    Args:
        url: accepted for the caller's convenience; not used here.
        text: raw text to tokenize.

    Returns:
        ``(tokens, lean_minhash)`` -- the list of cleaned tokens in order of
        appearance (duplicates kept) and a 128-permutation LeanMinHash
        updated with each of them.
    """
    # preprocessing step, converts all characters to lowercase
    words = text.replace("  ", " ").replace("\n", " ").lower().strip()
    # the tokenizing step
    tokens = word_tokenize(words)
    # Build the stopword lookup once: calling stopwords.words() per token
    # rebuilds the full list and scans it linearly every time.
    stop_words = set(stopwords.words())
    leading_alnum = re.compile('[A-Za-z0-9]+')
    non_alnum = re.compile('[^A-Za-z0-9]+')
    # this is the list of valid tokens
    ftokens = []
    mh = MinHash(num_perm=128)
    # iterates through full token list to filter out invalid tokens
    for t in tokens:
        # do not include the token if it is a stopword
        if t in stop_words:
            continue
        # match() anchors at the start, so this drops every token that does
        # not BEGIN with an alphanumeric character.
        # NOTE(review): if the intent is "contains no alphanumerics at all",
        # this should be search() -- confirm before changing.
        if not leading_alnum.match(t):
            continue
        # remove any non-alphanumeric characters from the token
        t2 = non_alnum.sub('', t).strip()
        # skip the token if it is an empty string
        if len(t2) <= 0:
            continue
        ftokens.append(t2)
        mh.update(t2.encode("utf8"))
    return ftokens, LeanMinHash(mh)

예제 #26

0

파일 보기

파일: commonhash_util.py 프로젝트: 39239580/res_sys_tool-new-

    def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
        """Pick a hash function and build the requested datasketch object.

        The constructed object is stored in ``self.hash``; the selected hash
        function is stored in ``self.hashfunc``.  Note that ``self.hashfunc``
        is only forwarded to ``MinHash``; the HyperLogLog variants take their
        hash function from ``params["hashfunc"]`` instead.

        Args:
            hash_type: name or alias of the object to build: MinHash,
                MinHashLSH / WeightedMinHashLSH, LeanMinHash,
                MinHashLSHForest, MinHashLSHEnsemble, HyperLogLog or
                HyperLogLog++ (see the alias sets below).
            bits: hash width: 32 (also "32" or None), 64 ("64") or
                128 ("128").
            hash_func: "mmh3", "farmhash" or "xxhash".  Any other value
                selects the SHA1-based default for 32/64 bits.
            params: optional dict of keyword overrides for the constructor
                of the selected object; missing keys use the defaults
                spelled out below.

        Raises:
            ValueError: if ``bits`` is not supported, if ``bits`` is 128 and
                ``hash_func`` is neither "mmh3" nor "farmhash", or if
                ``hash_type`` is not a known alias.
        """
        self.hash_type = hash_type
        self.hash_func = hash_func
        self.hash_bits = bits
        self.hashfunc = sha1_hash32
        if self.hash_bits in {32, "32", None}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash32
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh32
            else:
                # "hash32","default":
                self.hashfunc = sha1_hash32

        elif self.hash_bits in {64, "64"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash64
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash64
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh64
            else:
                self.hashfunc = sha1_hash64

        elif self.hash_bits in {128, "128"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash128
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash128
            else:
                raise ValueError("请检查对应的hash函数类型与位数")

        else:
            raise ValueError("请检查对应的hash函数的位数")

        if not params:
            params = {}

        # ------------------------------------------------------------------
        # Usage notes for params["storage_config"] (MinHash LSH storage).
        #
        # To use Redis as the storage backend, configure:
        #
        #     storage_config={
        #         'type': 'redis',
        #         'redis': {'host': 'localhost', 'port': 6379},
        #     }
        #
        # To insert a large number of MinHashes sequentially, use an
        # insertion session; it reduces the number of network calls during
        # bulk insertion:
        #
        #     data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
        #     with lsh.insertion_session() as session:
        #         for key, minhash in data_list:
        #             session.insert(key, minhash)
        #
        # Note that querying the LSH object while an insertion session is
        # open may give inconsistent results.
        #
        # MinHash LSH also supports a Cassandra cluster as the storage
        # layer.  Long-term storage covers every use case in which the
        # application keeps updating the LSH object (for example when
        # clustering documents incrementally with MinHashLSH).  The
        # Cassandra storage option is configured as follows:
        #
        #     storage_config={
        #         'type': 'cassandra',
        #         'cassandra': {
        #             'seeds': ['127.0.0.1'],
        #             'keyspace': 'lsh_test',
        #             'replication': {
        #                 'class': 'SimpleStrategy',
        #                 'replication_factor': '1',
        #             },
        #             'drop_keyspace': False,
        #             'drop_tables': False,
        #         }}
        #
        # 'seeds' lists the seed nodes that can be contacted to connect to
        # the Cassandra cluster.  'keyspace' and 'replication' give the
        # parameters used when the keyspace is created (if it does not
        # exist yet).  To force creation of the tables or the keyspace (and
        # therefore drop existing ones), set 'drop_tables' and
        # 'drop_keyspace' to True.  As with Redis, an insertion session is
        # recommended to reduce network calls during bulk insertion.
        #
        # Connecting to an existing MinHash LSH
        # -------------------------------------
        # If the LSH uses an external storage layer (e.g. Redis) it can be
        # shared across several processes.  There are two ways to do so.
        #
        # The recommended way is pickling.  MinHash LSH objects are
        # serializable, so pickle can be used directly:
        #
        #     import pickle
        #
        #     # Create your LSH object
        #     lsh = ...
        #     # Serialize the LSH
        #     data = pickle.dumps(lsh)
        #     # Now you can pass it as an argument to a forked process or
        #     # simply save it in an external storage.
        #
        #     # In a different process, deserialize the LSH
        #     lsh = pickle.loads(data)
        #
        # With pickle everything that has to be known about the LSH, such
        # as its parameter settings, is kept in one place.
        # Alternatively, a base name can be given in the storage
        # configuration when the LSH is first created.  For example:
        #
        #     # For Redis.
        #     lsh = MinHashLSH(
        #         threshold=0.5, num_perm=128, storage_config={
        #             'type': 'redis',
        #             'basename': b'unique_name_6ac4fg',
        #             'redis': {'host': 'localhost', 'port': 6379},
        #         }
        #     )
        #
        #     # For Cassandra.
        #     lsh = MinHashLSH(
        #         threshold=0.5, num_perm=128, storage_config={
        #             'type': 'cassandra',
        #             'basename': b'unique_name',
        #             'cassandra': {
        #                 'seeds': ['127.0.0.1'],
        #                 'keyspace': 'lsh_test',
        #                 'replication': {
        #                     'class': 'SimpleStrategy',
        #                     'replication_factor': '1',
        #                 },
        #                 'drop_keyspace': False,
        #                 'drop_tables': False,
        #             }
        #         }
        #     )
        #
        # The base name is used to generate the key prefix that uniquely
        # identifies the data belonging to this LSH in the storage layer.
        # Creating a new LSH object with the same base name therefore reuses
        # the underlying data of the old LSH in that storage layer.
        #
        # If no basename is given, MinHash LSH generates a random string as
        # the base name; collisions are extremely unlikely.
        #
        # For more detailed usage see: http://ekzhu.com/datasketch/lsh.html
        # ------------------------------------------------------------------

        if self.hash_type in {"minhash", "MinHash"}:
            # Mainly for Jaccard similarity: estimates the Jaccard
            # similarity between sets of arbitrary size in linear time using
            # a small, fixed amount of memory.
            self.hash = MinHash(
                # Optional int; ignored if hashvalues is not None.  Number
                # of random permutation functions; controls the accuracy.
                num_perm=params.get("num_perm", 128),
                # Optional random seed.
                seed=params.get("seed", 1),
                # Optional hash function applied to the input passed to
                # update(); returns an integer encodable in 32 bits.
                hashfunc=self.hashfunc,
                # Deprecated; replaced by hashfunc.
                hashobj=params.get("hashobj", None),
                # Optional array or list.
                hashvalues=params.get("hashvalues", None),
                # Optional permutation parameters; the state of an existing
                # MinHash can be passed here for fast initialisation.
                permutations=params.get("permutations", None))
        elif self.hash_type in {
                "weightedminhashlsh", "mhlsh", "WeightedMinHashLSH", "wmhlsh",
                "MinHashLSH"
        }:  # Weighted MinHash locality-sensitive hashing.
            # WeightedMinHashLSH() is equivalent to MinHashLSH; used for
            # weighted-Jaccard similarity queries.
            # Top-k queries are not supported here, but MinHashLSHForest
            # supports top-k.
            self.hash = MinHashLSH(
                # Jaccard threshold.
                threshold=params.get("threshold", 0.9),
                # Number of permutation functions; the sample size for
                # weighted MinHash.
                num_perm=params.get("num_perm", 128),
                # Optional tuple used to optimise for the Jaccard threshold.
                weights=params.get("weights", (0.5, 0.5)),
                # Optional tuple: number and size of the bands.
                params=params.get("params", None),
                # Storage configuration.
                storage_config=params.get("storage_config", None),
                # Original note: "stored in pickle format by default".
                prepickle=params.get("prepickle", None))
        elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
            # A sketch with a smaller memory footprint than MinHash.
            self.hash = LeanMinHash(minhash=params.get("minhash", None),
                                    seed=params.get("seed", None),
                                    hashvalues=params.get("hashvalues", None))

        elif self.hash_type in {
                "MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"
        }:
            self.hash = MinHashLSHForest(num_perm=params.get("num_perm", 128),
                                         l=params.get("l", 8))

        elif self.hash_type in {
                # The lowercase alias was previously unreachable because
                # "MinHashLSHEnsemble" was listed twice.
                "MinHashLSHEnsemble", "minhashlshensemble", "mhlshe", "MHLSHE"
        }:
            # MinHash LSH using a different measure: containment.
            self.hash = MinHashLSHEnsemble(
                threshold=params.get("threshold", 0.9),
                num_perm=params.get("num_perm", 128),
                num_part=params.get("num_part", 16),
                m=params.get("m", 8),
                weights=params.get("weights", (0.5, 0.5)),
                storage_config=params.get("storage_config", None),
                prepickle=params.get("prepickle", None))

        elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
            # HyperLogLog estimates the cardinality (number of distinct
            # values) of a data set in a single pass using a small, fixed
            # amount of memory.
            self.hash = HyperLogLog(
                # Controls precision, comparable to num_perm in MinHash.
                p=params.get("p", 8),
                reg=params.get("reg", None),
                # Hash algorithm used internally.
                hashfunc=params.get("hashfunc", sha1_hash32),
                # Replaced by hashfunc.
                hashobj=params.get("hashobj", None))

        elif self.hash_type in {
                "hyperloglogplusplus", "HyperLogLogPlusPlus", "HyperLogLog++",
                # "hyperlogkog++" is a historical typo kept for backward
                # compatibility; "hyperloglog++" is the intended alias.
                "hyperloglog++", "hyperlogkog++",
                "HLLPP", "hllpp", "HLL++", "hll++"
        }:
            # Same interface as HyperLogLog.
            self.hash = HyperLogLogPlusPlus(
                p=params.get("p", 8),
                reg=params.get("reg", None),
                # Uses a 64-bit hash algorithm.
                hashfunc=params.get("hashfunc", sha1_hash64),
                hashobj=params.get("hashobj", None))

        else:
            raise ValueError("请选择正确的函数函数对象")

예제 #27

0

파일 보기

파일: document.py 프로젝트: nguyentuc/chatbot_v4_21-10

 def get_minhash(self, shingles, num_perm):
     """Return a LeanMinHash with ``num_perm`` permutations built from the
     UTF-8 encoding of every item in ``shingles``."""
     sketch = MinHash(num_perm=num_perm)
     encoded_shingles = (item.encode('utf8') for item in shingles)
     for payload in encoded_shingles:
         sketch.update(payload)
     lean = LeanMinHash(sketch)
     return lean

예제 #28

0

파일 보기

    def test_serialize(self):
        """Serialize one sketch into an exact-size buffer, then two sketches
        back to back into a shared double-size buffer."""
        m1 = MinHash(2, 1, hashfunc=fake_hash_func)
        lm1 = LeanMinHash(m1)
        buf = bytearray(lm1.bytesize())
        # Only test for syntax
        lm1.serialize(buf)

        m2 = MinHash(2, 1, hashfunc=fake_hash_func)
        lm2 = LeanMinHash(m2)
        size = lm1.bytesize()
        buf = bytearray(size * 2)
        lm1.serialize(buf)
        # Slicing a bytearray copies it, so serializing into buf[size:]
        # would write into a temporary and leave buf untouched.  A
        # memoryview slice writes into buf itself.
        lm2.serialize(memoryview(buf)[size:])
        # Both sketches were built identically and never updated, so the
        # two halves of the shared buffer must hold the same bytes.
        self.assertEqual(bytes(buf[:size]), bytes(buf[size:]))

예제 #29

0

파일 보기

 def test_bytesize(self):
     """``bytesize()`` of a LeanMinHash built from ``MinHash(4, 1, ...)``
     is expected to equal (4 * 4) + 4 + 8."""
     source = MinHash(4, 1, hashfunc=fake_hash_func)
     lean = LeanMinHash(source)
     expected_size = (4 * 4) + 4 + 8
     actual_size = lean.bytesize()
     self.assertTrue(actual_size == expected_size)

예제 #30

0

파일 보기

 def test_is_empty(self):
     """A LeanMinHash built from a freshly constructed MinHash is expected
     to report itself as empty."""
     lean = LeanMinHash(MinHash())
     emptiness = lean.is_empty()
     self.assertTrue(emptiness)