import numpy as np


class VisualMinHashWithDataSketch:
    """
    minHash with sketches for near-duplicate image detection.
    This is an implementation of the minHash algorithm introduced in
    Scalable Near Identical Image and Shot Detection - Microsoft
    (https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/civr2007.pdf)
    by Ondrej Chum, James Philbin, Michael Isard, Andrew Zisserman
    """
    # TODO: add word weighting to this minHash algorithm.

    def __init__(self, minHash_hash_num=512, minHash_param_k=512,
                 minHash_param_s=3, rand_seed=0):
        # We could implement minHash as explicit permutations of the vocabulary.
        # However, that is memory inefficient. As an alternative, we can use hash
        # functions and take the min value over the existing members.
        # TODO: This alternative may not work. Check this out.
        from datasketch import MinHash

        # In the paper, sec 4.1, they use 512 independent hash functions and build
        # 512 sketches by reusing each hash function multiple times.
        # I think this is not a valid implementation, because the sketches are no
        # longer independent. Maybe that was a compromise between mathematical
        # accuracy and speed: calculating 512*3 hash functions is 3 times slower.
        # To reproduce the paper's results, I may have to follow that implementation.
        # But let me try the correct implementation first, which makes the 512
        # sketches truly independent.
        self.minHash_hash_num = minHash_hash_num  # number of independent hash functions
        self.minHash_param_k = minHash_param_k  # number of sketches
        self.minHash_param_s = minHash_param_s  # tuple length, or sketch size

        np.random.seed(rand_seed)
        self.sketch_choices = []
        for k in range(minHash_param_k):
            # For each sketch, pick s hash-function indexes at random.
            rand_choice_hashfunc = []
            for s in range(minHash_param_s):
                rand_choice_hashfunc.append(
                    np.random.randint(0, minHash_hash_num))
            # print('choice:', rand_choice_hashfunc)
            self.sketch_choices.append(rand_choice_hashfunc)

        self.minHash = MinHash(num_perm=minHash_hash_num, seed=rand_seed)

    def hash_bow(self, target_set):
        # Reset the minHash state before hashing a new set.
        self.minHash.clear()
        for elem in target_set:
            # datasketch's update() expects bytes, so encode the visual-word id.
            self.minHash.update(str(elem).encode('utf8'))
        hashval = self.minHash.digest()
        # print('hashval:', hashval)

        # Group the per-hash-function minima into k sketches of length s.
        result = []
        for choice_indexes in self.sketch_choices:
            # print('choice_indexes:', choice_indexes)
            sketch = hashval[choice_indexes]
            # print('sketch:', sketch)
            result.append(tuple(sketch))
        return result
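# A minimal usage sketch with hypothetical visual-word ids; a real pipeline would
# supply bag-of-visual-words sets extracted from image descriptors. Following the
# paper, a pair of images becomes a near-duplicate candidate when at least one of
# their sketches collides.
def _demo_visual_minhash():
    hasher = VisualMinHashWithDataSketch(
        minHash_hash_num=512, minHash_param_k=512, minHash_param_s=3)

    # Hypothetical visual-word ids for two heavily overlapping images.
    image_a_words = set(range(0, 300))
    image_b_words = set(range(20, 320))

    sketches_a = hasher.hash_bow(image_a_words)
    sketches_b = hasher.hash_bow(image_b_words)

    collisions = sum(sa == sb for sa, sb in zip(sketches_a, sketches_b))
    print('colliding sketches: %d / %d' % (collisions, hasher.minHash_param_k))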
import copy
from typing import List, Optional

import bitarray
import farmhash
import numpy as np
import torch
from datasketch import MinHash

# Projector, PradoProjectorConfig and PradoProjectionOperator are assumed to be
# defined elsewhere in this project.


class PradoProjector(Projector):

    def __init__(
        self,
        feature_length: Optional[int] = None,
        config: Optional[PradoProjectorConfig] = None,
    ):
        super().__init__()

        if config is None:
            config = PradoProjectorConfig(feature_length=feature_length)

        self._config = copy.deepcopy(config)
        self._hashobj = MinHash(num_perm=self.n_permutations,
                                hashfunc=farmhash.hash32)
        self._projection_operator = PradoProjectionOperator()
        self._vectorized_projection = np.vectorize(self.project,
                                                   signature="()->(n)")

    # region Properties
    @property
    def feature_length(self) -> int:
        return self._config.feature_length

    @property
    def B(self) -> int:
        return self.feature_length

    @property
    def n_permutations(self) -> int:
        # Each 32-bit hash contributes 32 projection bits; round up to cover 2B bits.
        return (2 * self.B + 32 - 1) // 32

    # endregion

    def project(self, x: str):
        self._hashobj.clear()
        self._hashobj.update(x)

        # (4 * n_permutations, )
        token_as_bytes = b"".join(
            int(h).to_bytes(4, "big") for h in self._hashobj.digest())

        # (32 * n_permutations, )
        token_as_bits = bitarray.bitarray()
        token_as_bits.frombytes(token_as_bytes)

        # (2B, ) - MinHash can give us more bits than we need. Choose B so the
        # truncation below does not skip data; in other words, B should be a
        # multiple of 16.
        return torch.tensor(token_as_bits[:2 * self.B], dtype=torch.float)

    def __call__(self, x: List) -> torch.Tensor:
        # Can be anything, (Any, N[str]) -> (Any, N, 2B)
        token_features = self._vectorized_projection(x)
        token_features = torch.tensor(token_features, dtype=torch.float)

        # (Any, N, 2B) -> (Any, N, B, 2)
        token_features = torch.reshape(token_features,
                                       (*token_features.shape[:-1], -1, 2))

        # (Any, N, B, 2) -> (Any, N, B, 1)
        fingerprint = self._projection_operator(token_features)

        # (Any, N, B, 1) -> (Any, N, B)
        fingerprint = torch.squeeze(fingerprint, dim=-1)

        return fingerprint
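# A minimal usage sketch under the assumption that PradoProjectorConfig accepts a
# feature_length argument and that PradoProjectionOperator maps (..., B, 2) to
# (..., B, 1) as the comments in __call__ describe; the token list and the value
# feature_length=64 below are hypothetical.
def _demo_prado_projector():
    projector = PradoProjector(feature_length=64)  # B = 64, a multiple of 16
    tokens = ["the", "quick", "brown", "fox"]
    fingerprint = projector(tokens)
    print(fingerprint.shape)  # expected: torch.Size([4, 64])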