def test_single_instance_created():
    """Check that two consecutive ``RNG.instance()`` calls yield the same object."""
    # set up: request the generator twice through the same accessor
    first_handle = RNG.instance()
    second_handle = RNG.instance()

    # asserts: both names must be bound to one and the same object
    assert first_handle is second_handle
from operator import xor import heapq from lsh.utils.similarity import compute_positive_hash from lsh.utils.random_number_generator import RNG DEFAULT_NUM_RANDOM_NUMS = 200 #TODO should create a config to set this DEFAULT_BITS = 32 #TODO should create a config to set this # we want to use the same random numbers across all documents we check, that is why I'm using # a thread safe, singleton to generate my random numbers RANDOM_NUMBERS = RNG.instance(DEFAULT_NUM_RANDOM_NUMS, DEFAULT_BITS) def run(shingles_list): """ Generates minhash values for each shingle in the given list. :param shingles_list: shingles from one document (this list represents one document) :return: list of minhash values (long integers) for a given list of shingles (document) """ #basic minhash implementation algorithm steps... min_hash_values = [] if shingles_list: for shingle in shingles_list: # reset min-heap as each shingle should get it's own min-heap # to calculate the minimum hash value min_heap = [] for count in range(0, DEFAULT_NUM_RANDOM_NUMS): #step 1: calculate hash values for current shingle