def test_single_instance_created():
    """Check that two consecutive RNG.instance() calls return the same object."""
    # Request the instance twice, in order.
    first, second = RNG.instance(), RNG.instance()

    # Identity, not equality: both names must refer to one object.
    assert first is second
예제 #2
0
def test_single_instance_created():
    """Check that RNG.instance() hands back one shared object on repeated calls."""
    # Call the accessor twice and keep both results.
    obtained = [RNG.instance() for _ in range(2)]

    # Both results must be the very same object.
    assert obtained[0] is obtained[1]
예제 #3
0
from operator import xor
import heapq
from lsh.utils.similarity import compute_positive_hash

from lsh.utils.random_number_generator import RNG

# First argument to RNG.instance(); run() also loops this many times per shingle.
DEFAULT_NUM_RANDOM_NUMS = 200  # TODO: move this into a config
# Second argument to RNG.instance() -- presumably the bit width of each random
# number; confirm against the RNG implementation.
DEFAULT_BITS = 32  # TODO: move this into a config

# The same random numbers must be used across all documents we check, which is
# why they are obtained once, at import time, from the shared RNG instance.
# NOTE(review): the original author describes RNG as a thread-safe singleton;
# that cannot be confirmed from this module.
RANDOM_NUMBERS = RNG.instance(DEFAULT_NUM_RANDOM_NUMS, DEFAULT_BITS)

def run(shingles_list):
    """
    Generates minhash values for each shingle in the given list.
    :param shingles_list: shingles from one document (this list represents one document)
    :return: list of minhash values (long integers) for a given list of shingles (document)
    """

    #basic minhash implementation algorithm steps...
    min_hash_values = []

    if shingles_list:
        for shingle in shingles_list:
            # reset min-heap as each shingle should get it's own min-heap
            # to calculate the minimum hash value
            min_heap = []

            for count in range(0, DEFAULT_NUM_RANDOM_NUMS):
                #step 1: calculate hash values for current shingle