Example #1

import numpy as np
from datasketch import MinHash


class VisualMinHashWithDataSketch:
    """
    minHash with sketches for near image duplicate detection.
    This is an implementation of minHash algorithm introduced in
    Scalable Near Identical Image and Shot Detection - Microsoft (https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/civr2007.pdf)
    by Ondrej Chum, James Philbin, Michael Isard, Andrew Zisserman
    """

    # TODO: add word weighting on this minHash algorithm.

    def __init__(self,
                 minHash_hash_num=512,
                 minHash_param_k=512,
                 minHash_param_s=3,
                 rand_seed=0):
        # We could implement minHash as explicit permutations of the vocabulary.
        # However, that is memory inefficient. As an alternative, we use a hash
        # function per permutation and take the min value over the set members.
        # TODO: verify that this alternative behaves like true permutations.

        # In the paper (sec. 4.1), 512 independent hash functions are used, and
        # 512 sketches are formed by reusing each hash function multiple times.
        # I think this is not a valid implementation, because the sketches are
        # no longer independent. Maybe that was a compromise between
        # mathematical accuracy and speed: calculating 512*3 hash functions is
        # 3 times slower. To reproduce the paper's results, I may have to
        # follow that implementation. But let me try the correct implementation
        # first, which makes the 512 sketches truly independent.
        self.minHash_hash_num = minHash_hash_num  # number of independent hash functions
        self.minHash_param_k = minHash_param_k  # number of sketches
        self.minHash_param_s = minHash_param_s  # tuple length (sketch size)

        np.random.seed(rand_seed)
        # For each of the k sketches, randomly pick s of the hash functions
        # (with replacement).
        self.sketch_choices = []
        for _ in range(minHash_param_k):
            rand_choice_hashfunc = [
                np.random.randint(0, minHash_hash_num)
                for _ in range(minHash_param_s)
            ]
            self.sketch_choices.append(rand_choice_hashfunc)

        self.minHash = MinHash(num_perm=minHash_hash_num, seed=rand_seed)

    def hash_bow(self, target_set):
        # Reset the minHash state before hashing a new bag of words.
        self.minHash.clear()

        # datasketch's MinHash.update expects bytes, so encode each element.
        for elem in target_set:
            self.minHash.update(str(elem).encode("utf8"))

        hashval = self.minHash.digest()

        result = []
        for choice_indexes in self.sketch_choices:
            # Fancy-index the digest to build one s-tuple sketch.
            sketch = hashval[choice_indexes]
            result.append(tuple(sketch))
        return result
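
A minimal usage sketch for the class above (the word-id sets and the idea of
counting colliding sketches follow the paper's sketch-collision test, but the
concrete values here are illustrative assumptions):

hasher = VisualMinHashWithDataSketch()

# Bags of visual words for two images, as sets of integer word ids (made up).
words_a = {1, 5, 9, 42, 77, 100}
words_b = {1, 5, 9, 42, 77, 101}

sketches_a = hasher.hash_bow(words_a)
sketches_b = hasher.hash_bow(words_b)

# Two images are near-duplicate candidates when enough sketches collide.
collisions = sum(a == b for a, b in zip(sketches_a, sketches_b))
print(collisions, "of", hasher.minHash_param_k, "sketches collide")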
Example #2

import copy
from typing import List, Optional

import bitarray
import farmhash  # pip package "pyfarmhash"
import numpy as np
import torch
from datasketch import MinHash

# Projector, PradoProjectorConfig, and PradoProjectionOperator are
# project-local classes defined elsewhere in the source repository.


class PradoProjector(Projector):
    def __init__(
        self,
        feature_length: Optional[int] = None,
        config: Optional[PradoProjectorConfig] = None,
    ):
        super().__init__()

        if config is None:
            config = PradoProjectorConfig(feature_length=feature_length)

        self._config = copy.deepcopy(config)
        self._hashobj = MinHash(num_perm=self.n_permutations,
                                hashfunc=farmhash.hash32)
        self._projection_operator = PradoProjectionOperator()

        self._vectorized_projection = np.vectorize(self.project,
                                                   signature="()->(n)")

    # region Properties
    @property
    def feature_length(self) -> int:
        return self._config.feature_length

    @property
    def B(self) -> int:
        return self.feature_length

    @property
    def n_permutations(self) -> int:
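        # ceil(2B / 32): e.g. B = 64 -> (128 + 31) // 32 = 4 permutations,
        # i.e. 4 * 32 = 128 = 2B hash bits with none wasted.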
        return (2 * self.B + 32 - 1) // 32

    # endregion

    def project(self, x: str):
        self._hashobj.clear()
        self._hashobj.update(x)

        # Pack each 32-bit hash as 4 big-endian bytes -> (4 * n_permutations,) bytes.
        token_as_bytes = b"".join(
            int(h).to_bytes(4, "big") for h in self._hashobj.digest())

        # (32 * n_permutations, )
        token_as_bits = bitarray.bitarray()
        token_as_bits.frombytes(token_as_bytes)
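        # e.g. the two 32-bit hashes 0x00000001 and 0x80000000 pack into
        # 8 bytes = 64 bits: 31 zeros, a one, a one, then 31 zeros.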

        # (2B, ) - MinHash can give us more bits than we need. Choose B so
        # that this slice does not drop data: B should be a multiple of 16,
        # so that 32 * n_permutations == 2B exactly.
        return torch.tensor(token_as_bits[:2 * self.B].tolist(),
                            dtype=torch.float)

    def __call__(self, x: List) -> torch.Tensor:
        # Leading dims can be anything: (..., N) strings -> (..., N, 2B) bits.
        token_features = self._vectorized_projection(x)
        token_features = torch.tensor(token_features, dtype=torch.float)

        # (Any, N, 2B) -> (Any, N, B, 2)
        token_features = torch.reshape(token_features,
                                       (*token_features.shape[:-1], -1, 2))

        # (Any, N, B, 2) -> (Any, N, B, 1)
        fingerprint = self._projection_operator(token_features)

        # (Any, N, B, 1) -> (Any, N, B)
        fingerprint = torch.squeeze(fingerprint, dim=-1)

        return fingerprint
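
A hypothetical usage sketch (it assumes PradoProjectorConfig accepts
feature_length as in __init__ above and that the project-local classes are
available; the token list is made up):

projector = PradoProjector(config=PradoProjectorConfig(feature_length=64))

tokens = ["the", "quick", "brown", "fox"]  # N = 4 tokens
fingerprint = projector(tokens)            # shape (4, 64), i.e. (N, B)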