def __init__(self, hash_name):
    """
    Hands the hash name to the parent constructor and sets up empty state.
    """
    super(HashPermutations, self).__init__(hash_name)
    # Dimension is unknown at construction time
    self.dim = None
    # No child hashes registered yet
    self.child_hashes = []
    # Container used for the permuted indexes
    self.permutation = Permutation()
class HashPermutations(LSHash):
    """
    By: Xing Shi ([email protected])

    This meta-hash performs hash permutations on binary bucket keys.

    You use this just like every other LSHash implementation and add
    the actual binary hashes you want to use via the add_child_hash
    method. Each child hash will be used separately. After all vectors
    have been indexed you have to call build_permuted_index to generate
    the permuted index.

    So to use this you have to do the following steps:

    1. Create HashPermutations instance and use it in the Engine constructor
    2. Add your binary hashes as child hashes by calling add_child_hash()
    3. Store your vectors using the Engine
    4. Build the permuted index by calling build_permuted_index()
    5. Now when you query the permuted index is used

    If you are adding more vectors afterwards you can update the
    permuted index by calling build_permuted_index() again and again.
    """

    def __init__(self, hash_name):
        """
        Just keeps the name and creates empty state.

        hash_name is forwarded to the LSHash base constructor.
        """
        super(HashPermutations, self).__init__(hash_name)
        # Holds the permuted indexes, keyed by child hash name
        self.permutation = Permutation()
        # One dict per child: {'hash': ..., 'config': ..., 'bucket_keys': ...}
        self.child_hashes = []
        # Set by reset() or apply_config()
        self.dim = None

    def reset(self, dim):
        """
        Resets / Initializes the hash for the specified dimension.

        Every child hash registered at the time of the call is reset to
        the same dimension and its collected bucket keys are discarded.
        Child hashes added afterwards are not touched by this call.
        """
        self.dim = dim

        # Reset all child hashes
        for child_hash in self.child_hashes:
            child_hash['hash'].reset(dim)
            child_hash['bucket_keys'] = {}

    def hash_vector(self, v, querying=False):
        """
        Hashes the vector and returns a list of bucket keys (strings).

        Every returned key is prefixed with the name of the child hash
        that produced it, followed by an underscore.

        When indexing (querying=False) the regular bucket keys of all
        child hashes are returned and remembered, so that
        build_permuted_index() knows which buckets are in use.

        When querying (querying=True) each regular bucket key is replaced
        by its neighbour keys taken from the permuted index.

        Raises AttributeError if querying and the permuted index for one
        of the child hashes has not been built yet.
        """
        bucket_keys = []

        if querying:
            # If we are querying, use the permuted indexes to get bucket keys
            for child_hash in self.child_hashes:
                lshash = child_hash['hash']

                # Make sure the permuted index for this hash is existing
                if lshash.hash_name not in self.permutation.permutedIndexs:
                    raise AttributeError('Permuted index is not existing for hash with name %s' % lshash.hash_name)

                prefix = lshash.hash_name + '_'

                # Get regular bucket keys from hash
                for bucket_key in lshash.hash_vector(v, querying):
                    # Get neighbour keys from permuted index
                    neighbour_keys = self.permutation.get_neighbour_keys(lshash.hash_name, bucket_key)

                    # Add them to result, but prefix with hash name
                    bucket_keys.extend(prefix + n for n in neighbour_keys)
        else:
            # If we are indexing (storing) just use child hashes without
            # permuted index
            for child_hash in self.child_hashes:
                lshash = child_hash['hash']
                prefix = lshash.hash_name + '_'

                # Get regular bucket keys from hash
                for bucket_key in lshash.hash_vector(v, querying):
                    # Register bucket key in child hash dict
                    child_hash['bucket_keys'][bucket_key] = bucket_key

                    # Append bucket key to result prefixed with child hash name
                    bucket_keys.append(prefix + bucket_key)

        # Return all the bucket keys
        return bucket_keys

    def get_config(self):
        """
        Returns pickle-serializable configuration struct for storage.

        Only the hash name and the dimension are included; child hashes
        and the permuted index are not part of the returned struct.
        """
        return {
            'hash_name': self.hash_name,
            'dim': self.dim
        }

    def apply_config(self, config):
        """
        Applies config, i.e. takes over 'hash_name' and 'dim' from it.
        """
        self.hash_name = config['hash_name']
        self.dim = config['dim']

    def add_child_hash(self, child_hash, permute_config):
        """
        Adds specified child hash with specified configuration.

        The hash must be one of the binary types (PCABinaryProjections,
        RandomBinaryProjections or RandomBinaryProjectionTree), otherwise
        a ValueError is raised.

        permute_config is a dict in the following format:

        permute_config = {
            "num_permutation":50,
            "beam_size":10,
            "num_neighbour":100
        }

        The child hash is stored as it is; it is not reset by this method.
        """
        binary_hash_types = (PCABinaryProjections,
                             RandomBinaryProjections,
                             RandomBinaryProjectionTree)

        # Hash must generate binary keys
        if not isinstance(child_hash, binary_hash_types):
            raise ValueError('Child hashes must generate binary keys')

        # Add both hash and config to array of child hashes. Also we are
        # going to accumulate used bucket keys for every hash in order to
        # build the permuted index
        self.child_hashes.append({
            'hash': child_hash,
            'config': permute_config,
            'bucket_keys': {}
        })

    def build_permuted_index(self):
        """
        Build PermutedIndex for all your binary hashings.

        PermutedIndex would be used to find the neighbour bucket key
        in terms of Hamming distance.

        For every child hash the values "num_permutation", "beam_size"
        and "num_neighbour" are read from the permute_config that was
        passed to add_child_hash(), together with the bucket keys that
        were collected for that child while indexing.
        """
        for child_hash in self.child_hashes:
            # Get config values for child hash
            config = child_hash['config']
            num_permutation = config['num_permutation']
            beam_size = config['beam_size']
            num_neighbour = config['num_neighbour']

            # Get used bucket keys for child hash. A list snapshot is
            # passed on so that the index does not hold a live view of
            # the dict that hash_vector() keeps updating.
            bucket_keys = list(child_hash['bucket_keys'])

            # Get actual child hash
            lshash = child_hash['hash']

            # Compute permuted index for this hash
            self.permutation.build_permuted_index(lshash, bucket_keys, num_permutation, beam_size, num_neighbour)