예제 #1
0
def main() -> None:
    """Build one random MinHash and query a Cassandra-backed LSH index with it.

    Every stage runs inside a one-iteration ``tqdm`` loop, so each stage gets
    its own labelled progress line.

    The LSH index is configured with ``drop_keyspace`` and ``drop_tables``
    set to ``False``, i.e. it connects without asking for existing data to
    be dropped.

    On success the query result is printed. If the query raises an ordinary
    exception, its message and the line ``Error`` are printed instead of
    letting the exception propagate.
    """
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        # 200 random tokens, each made of 5 lowercase ASCII letters.
        list_strings = [
            ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
            for _ in range(200)
        ]
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5,
                         num_perm=256,
                         storage_config={
                             'type': 'cassandra',
                             'basename': b'perftest',
                             'cassandra': {
                                 'seeds': ['127.0.0.1'],
                                 'keyspace': config.KEY_SPACE,
                                 'replication': {
                                     'class': 'SimpleStrategy',
                                     'replication_factor': '1',
                                 },
                                 'drop_keyspace': False,
                                 'drop_tables': False,
                             }
                         })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
        print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except Exception as e:
        # Catch Exception, not BaseException: KeyboardInterrupt and SystemExit
        # must still be able to stop the script.
        print(str(e))
        print("Error")
예제 #2
0
 def search(self,
            query: List[int]) -> Union[List[int], List[Tuple[int, float]]]:
     """Look up LSH candidates for ``query``, optionally scored and filtered.

     A MinHash with ``self._lsh.h`` permutations is built from ``query``
     and used to query ``self._lsh``.

     Returns:
         If ``self._threshold`` is ``None``: the raw result of the LSH query.
         Otherwise: ``(key, score)`` pairs, where ``score`` is
         ``BitMap(query).jaccard_index(self._fingerprints[key])``, limited to
         scores ``>= self._threshold`` and ordered by descending score.
     """
     signature = MinHash(num_perm=self._lsh.h, hashfunc=hash)
     signature.update_batch(query)
     candidates = self._lsh.query(signature)

     min_score = self._threshold
     if min_score is None:
         return candidates

     fingerprints = self._fingerprints
     query_bits = BitMap(query)
     scored = []
     for key in candidates:
         score = query_bits.jaccard_index(fingerprints[key])
         if score >= min_score:
             scored.append((key, score))
     # Stable sort on the score only, best match first.
     scored.sort(key=itemgetter(1), reverse=True)
     return scored
예제 #3
0
def main() -> None:
    """Generate random MinHashes, insert them into the LSH index and log them.

    Steps:
        1. Create ``config.COUNT_UNQ_MHS`` MinHashes, each built from 200
           random 5-letter lowercase strings, keyed ``"key0"``, ``"key1"``, ...
        2. Insert all of them into a Cassandra-backed ``MinHashLSH`` through
           an insertion session with ``buffer_size=100``.
        3. Write each key with its MinHash digest to ``minhashes.txt`` and
           each key with its source strings to ``files.txt`` via ``log``.
    """
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS),
                         desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = [
            ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
            for _ in range(200)
        ]
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5,
                     num_perm=256,
                     storage_config={
                         'type': 'cassandra',
                         'basename': b'perftest',
                         'cassandra': {
                             'seeds': ['127.0.0.1'],
                             'keyspace': config.KEY_SPACE,
                             'replication': {
                                 'class': 'SimpleStrategy',
                                 'replication_factor': '1',
                             },
                             'drop_keyspace': False,
                             'drop_tables': False,
                         }
                     })

    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    # Context managers guarantee both log files are closed, even if log()
    # raises; previously files.txt was never closed.
    with open('minhashes.txt', 'w+') as f_disc_mhs:
        for key, minhash in tqdm(minhashes, desc="Log minHashes:"):
            log(f_disc_mhs, key, minhash.digest())

    with open('files.txt', 'w+') as f_disc_files:
        # zip() has no len(), so pass total explicitly to keep the bar sized.
        for (key, _), file in tqdm(zip(minhashes, files),
                                   total=len(files),
                                   desc="Log files:"):
            log(f_disc_files, key, file)
예제 #4
0
def get_minhash(args):
    """Build a MinHash for one labelled item.

    Args:
        args: A packed pair ``((n, x), num_perm)``. ``n`` is passed through
            unchanged, ``x`` is the batch fed to ``MinHash.update_batch``
            and ``num_perm`` is the number of permutations.
            NOTE(review): the single packed argument presumably exists so
            the function can be used with a ``map``-style API; confirm
            against the caller.

    Returns:
        The tuple ``(n, minhash)``.
    """
    labelled_item, num_perm = args
    label, values = labelled_item
    signature = MinHash(num_perm=num_perm, hashfunc=hash)
    signature.update_batch(values)
    return label, signature
예제 #5
0
from .matrix_utils import split, split_model, dedup_blocks

# Instantiate both Keras VGG models with their default arguments.
vgg16 = tf.keras.applications.VGG16()
vgg19 = tf.keras.applications.VGG19()

# NOTE(review): the meaning of (500, 500, 32) is defined by
# matrix_utils.split_model; presumably block dimensions -- confirm there.
s1 = split_model(vgg16, 500, 500, 32)
s2 = split_model(vgg19, 500, 500, 32)

num_perm = 128
min_dict = {}

# Hash every block of both models into min_dict under "s1-<i>" / "s2-<i>".
for prefix, blocks in (("s1", s1), ("s2", s2)):
    pbar = tqdm(total=len(blocks))
    for i, val in enumerate(blocks):
        # Quantize the block: scale by 10000, floor, cast to int, then hash.
        quantized = np.floor(val.flatten() * 10000).astype(int)
        m = MinHash(num_perm=num_perm)
        m.update_batch(quantized)
        min_dict[f"{prefix}-{i}"] = m
        pbar.update(1)
        # For a quick smoke run, break out of this loop once i > 20.

lsh2 = MinHashLSH(threshold=0.9, num_perm=num_perm)